Jump to

Models

Create Model

mindcv.models.model_factory.create_model(model_name, num_classes=1000, pretrained=False, in_channels=3, checkpoint_path='', ema=False, auto_mapping=False, **kwargs)

Creates model by name.

PARAMETER DESCRIPTION
model_name

The name of model.

TYPE: str

num_classes

The number of classes. Default: 1000.

TYPE: int DEFAULT: 1000

pretrained

Whether to load the pretrained model. Default: False.

TYPE: bool DEFAULT: False

in_channels

The input channels. Default: 3.

TYPE: int DEFAULT: 3

checkpoint_path

The path of checkpoint files. Default: "".

TYPE: str DEFAULT: ''

ema

Whether use ema method. Default: False.

TYPE: bool DEFAULT: False

auto_mapping

Whether to automatically map the names of checkpoint weights to the names of model weights when there are differences in names. Default: False.

TYPE: bool DEFAULT: False

**kwargs

additional args, e.g., "features_only", "out_indices".

DEFAULT: {}

Source code in mindcv\models\model_factory.py
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
def create_model(
    model_name: str,
    num_classes: int = 1000,
    pretrained: bool = False,
    in_channels: int = 3,
    checkpoint_path: str = "",
    ema: bool = False,
    auto_mapping: bool = False,
    **kwargs,
):
    r"""Instantiate a registered model by its name.

    Args:
        model_name (str):  The name of model.
        num_classes (int): The number of classes. Default: 1000.
        pretrained (bool): Whether to load the pretrained model. Default: False.
        in_channels (int): The input channels. Default: 3.
        checkpoint_path (str): The path of checkpoint files. Default: "".
        ema (bool): Whether use ema method. Default: False.
        auto_mapping (bool): Whether to automatically map the names of checkpoint weights
            to the names of model weights when there are differences in names. Default: False.
        **kwargs: additional args, e.g., "features_only", "out_indices".
    """

    # Loading a local checkpoint and downloading pretrained weights are exclusive options.
    if pretrained and checkpoint_path != "":
        raise ValueError("checkpoint_path is mutually exclusive with pretrained")

    base_kwargs = {"num_classes": num_classes, "pretrained": pretrained, "in_channels": in_channels}
    # Drop explicit None values so the entrypoint's own defaults win.
    extra_kwargs = {key: val for key, val in kwargs.items() if val is not None}

    if not is_model(model_name):
        raise RuntimeError(f"Unknown model {model_name}")

    model = model_entrypoint(model_name)(**base_kwargs, **extra_kwargs)

    if checkpoint_path:
        load_model_checkpoint(model, checkpoint_path, ema, auto_mapping)

    return model

bit

mindcv.models.bit

MindSpore implementation of BiT_ResNet. Refer to Big Transfer (BiT): General Visual Representation Learning.

mindcv.models.bit.BiT_ResNet

Bases: Cell

BiT_ResNet model class, based on "Big Transfer (BiT): General Visual Representation Learning" <https://arxiv.org/abs/1912.11370>_ Args: block(Union[Bottleneck]): block of BiT_ResNetv2. layers(tuple(int)): number of layers of each stage. wf(int): width of each layer. Default: 1. num_classes(int): number of classification classes. Default: 1000. in_channels(int): number of channels of the input. Default: 3. groups(int): number of groups for group conv in blocks. Default: 1. base_width(int): base width of pre group hidden channel in blocks. Default: 64. norm(nn.Cell): normalization layer in blocks. Default: None.

Source code in mindcv\models\bit.py
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
class BiT_ResNet(nn.Cell):
    r"""BiT_ResNet model class, based on
    `"Big Transfer (BiT): General Visual Representation Learning" <https://arxiv.org/abs/1912.11370>`_

    Args:
        block(Union[Bottleneck]): block of BiT_ResNetv2.
        layers(tuple(int)): number of layers of each stage.
        wf(int): width factor applied to every stage. Default: 1.
        num_classes(int): number of classification classes. Default: 1000.
        in_channels(int): number of channels of the input. Default: 3.
        groups(int): number of groups for group conv in blocks. Default: 1.
        base_width(int): base width of pre group hidden channel in blocks. Default: 64.
        norm(nn.Cell): normalization layer in blocks. Default: None.
    """

    def __init__(
        self,
        block: Type[Union[Bottleneck]],
        layers: List[int],
        wf: int = 1,
        num_classes: int = 1000,
        in_channels: int = 3,
        groups: int = 1,
        base_width: int = 64,
        norm: Optional[nn.Cell] = None,
    ) -> None:
        super().__init__()

        if norm is None:
            norm = nn.GroupNorm

        self.norm: nn.Cell = norm  # add type hints to make pylint happy
        self.input_channels = 64 * wf
        self.groups = groups
        self.base_width = base_width  # fixed typo: attribute was previously named `base_with`

        # Stem: 7x7/2 weight-standardized conv, then explicit 1-pixel pad + 3x3/2 max-pool.
        self.conv1 = StdConv2d(in_channels, self.input_channels, kernel_size=7,
                               stride=2, pad_mode="pad", padding=3)
        self.pad = nn.ConstantPad2d(1, 0)
        self.max_pool = nn.MaxPool2d(kernel_size=3, stride=2, pad_mode="valid")

        self.layer1 = self._make_layer(block, 64 * wf, layers[0])
        self.layer2 = self._make_layer(block, 128 * wf, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256 * wf, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512 * wf, layers[3], stride=2)

        # Head normalization. Use `block.expansion` instead of the hard-coded 2048
        # (= 512 * 4) so the head stays consistent with the classifier below for
        # any block type whose expansion factor differs from Bottleneck's 4.
        self.gn = norm(32, 512 * block.expansion * wf)
        self.relu = nn.ReLU()
        self.pool = GlobalAvgPooling(keep_dims=True)
        # 1x1 conv acts as the fully-connected classifier on the pooled feature map.
        self.classifier = nn.Conv2d(512 * block.expansion * wf, num_classes, kernel_size=1, has_bias=True)

    def _make_layer(
        self,
        block: Type[Union[Bottleneck]],
        channels: int,
        block_nums: int,
        stride: int = 1,
    ) -> nn.SequentialCell:
        """Build one residual stage of `block_nums` blocks; only the first block may downsample."""
        down_sample = None

        # A projection shortcut is needed whenever the spatial size or channel count changes.
        if stride != 1 or self.input_channels != channels * block.expansion:
            down_sample = nn.SequentialCell([
                StdConv2d(self.input_channels, channels * block.expansion, kernel_size=1, stride=stride),
            ])

        layers = []
        layers.append(
            block(
                self.input_channels,
                channels,
                stride=stride,
                down_sample=down_sample,
                groups=self.groups,
                base_width=self.base_width,
                norm=self.norm,
            )
        )
        self.input_channels = channels * block.expansion

        for _ in range(1, block_nums):
            layers.append(
                block(
                    self.input_channels,
                    channels,
                    groups=self.groups,
                    base_width=self.base_width,
                    norm=self.norm,
                )
            )

        return nn.SequentialCell(layers)

    def root(self, x: Tensor) -> Tensor:
        """Stem: conv, pad, and max-pool the raw input."""
        x = self.conv1(x)
        x = self.pad(x)
        x = self.max_pool(x)
        return x

    def forward_features(self, x: Tensor) -> Tensor:
        """Network forward feature extraction."""
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        return x

    def forward_head(self, x: Tensor) -> Tensor:
        """Classification head: norm, activation, global pooling, 1x1-conv classifier."""
        x = self.gn(x)
        x = self.relu(x)
        x = self.pool(x)
        x = self.classifier(x)
        return x

    def construct(self, x: Tensor) -> Tensor:
        x = self.root(x)
        x = self.forward_features(x)
        x = self.forward_head(x)
        assert x.shape[-2:] == (1, 1)  # We should have no spatial shape left.
        return x[..., 0, 0]
mindcv.models.bit.BiT_ResNet.forward_features(x)

Network forward feature extraction.

Source code in mindcv\models\bit.py
247
248
249
250
251
252
253
def forward_features(self, x: Tensor) -> Tensor:
    """Run the four residual stages in order and return the final feature map."""
    feat = self.layer1(x)
    feat = self.layer2(feat)
    feat = self.layer3(feat)
    feat = self.layer4(feat)
    return feat

mindcv.models.bit.Bottleneck

Bases: Cell

define the basic block of BiT Args: in_channels(int): The channel number of the input tensor of the Conv2d layer. channels(int): The channel number of the output tensor of the middle Conv2d layer. stride(int): The movement stride of the 2D convolution kernel. Default: 1. groups(int): Number of groups for group conv in blocks. Default: 1. base_width(int): Base width of pre group hidden channel in blocks. Default: 64. norm(nn.Cell): Normalization layer in blocks. Default: None. down_sample(nn.Cell): Down sample in blocks. Default: None.

Source code in mindcv\models\bit.py
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
class Bottleneck(nn.Cell):
    """Pre-activation bottleneck block of BiT (norm and ReLU precede each conv).
    Args:
          in_channels(int): The channel number of the input tensor of the Conv2d layer.
          channels(int): The channel number of the output tensor of the middle Conv2d layer.
          stride(int): The movement stride of the 2D convolution kernel. Default: 1.
          groups(int): Number of groups for group conv in blocks. Default: 1.
          base_width(int): Base width of pre group hidden channel in blocks. Default: 64.
          norm(nn.Cell): Normalization layer in blocks. Default: None.
          down_sample(nn.Cell): Down sample in blocks. Default: None.
    """

    expansion: int = 4

    def __init__(
        self,
        in_channels: int,
        channels: int,
        stride: int = 1,
        groups: int = 1,
        base_width: int = 64,
        norm: Optional[nn.Cell] = None,
        down_sample: Optional[nn.Cell] = None,
    ) -> None:
        super().__init__()
        norm = norm if norm is not None else nn.GroupNorm

        # Hidden width of the bottleneck, scaled by base_width and group count.
        width = int(channels * (base_width / 64.0)) * groups

        # Pre-activation layout: each conv is preceded by GroupNorm + ReLU.
        self.gn1 = norm(32, in_channels)
        self.conv1 = StdConv2d(in_channels, width, kernel_size=1, stride=1)
        self.gn2 = norm(32, width)
        self.conv2 = StdConv2d(width, width, kernel_size=3, stride=stride,
                               padding=1, pad_mode="pad", group=groups)
        self.gn3 = norm(32, width)
        self.conv3 = StdConv2d(width, channels * self.expansion,
                               kernel_size=1, stride=1)

        self.relu = nn.ReLU()
        self.down_sample = down_sample

    def construct(self, x: Tensor) -> Tensor:
        shortcut = x

        y = self.relu(self.gn1(x))
        # The projection shortcut (if any) consumes the pre-activated input.
        preact = y

        y = self.conv1(y)
        y = self.conv2(self.relu(self.gn2(y)))
        y = self.conv3(self.relu(self.gn3(y)))

        if self.down_sample is not None:
            shortcut = self.down_sample(preact)

        # Pre-activation residual: no ReLU after the addition.
        return y + shortcut

mindcv.models.bit.StdConv2d

Bases: Conv2d

Conv2d with Weight Standardization Args: in_channels(int): The channel number of the input tensor of the Conv2d layer. out_channels(int): The channel number of the output tensor of the Conv2d layer. kernel_size(int): Specifies the height and width of the 2D convolution kernel. stride(int): The movement stride of the 2D convolution kernel. Default: 1. pad_mode(str): Specifies padding mode. The optional values are "same", "valid", "pad". Default: "same". padding(int): The number of padding on the height and width directions of the input. Default: 0. group(int): Splits filter into groups. Default: 1.

Source code in mindcv\models\bit.py
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
class StdConv2d(nn.Conv2d):
    r"""Conv2d with Weight Standardization: the kernel is normalized to zero mean
    and unit variance per output filter before every convolution.
    Args:
        in_channels(int): The channel number of the input tensor of the Conv2d layer.
        out_channels(int): The channel number of the output tensor of the Conv2d layer.
        kernel_size(int): Specifies the height and width of the 2D convolution kernel.
        stride(int): The movement stride of the 2D convolution kernel. Default: 1.
        pad_mode(str): Specifies padding mode. The optional values are "same", "valid", "pad". Default: "same".
        padding(int): The number of padding on the height and width directions of the input. Default: 0.
        group(int): Splits filter into groups. Default: 1.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride=1,
        pad_mode="same",
        padding=0,
        group=1,
    ) -> None:
        # Pass keyword arguments: in mindspore.nn.Conv2d the 7th positional
        # parameter is `dilation`, so forwarding `group` positionally right
        # after `padding` would silently set the dilation instead of the
        # group count.
        super(StdConv2d, self).__init__(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            pad_mode=pad_mode,
            padding=padding,
            group=group,
        )
        self.mean_op = ops.ReduceMean(keep_dims=True)

    def construct(self, x):
        """Standardize the weight over (cin, kh, kw) per filter, then convolve."""
        w = self.weight
        m = self.mean_op(w, [1, 2, 3])  # per-output-filter mean
        v = w.var((1, 2, 3), keepdims=True)  # per-output-filter variance
        # Small epsilon guards against division by zero for constant filters.
        w = (w - m) / ops.sqrt(v + 1e-10)
        output = self.conv2d(x, w)
        return output

mindcv.models.bit.BiT_resnet101(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get 101 layers ResNet model. Refer to the base class models.BiT_ResNet for more details.

Source code in mindcv\models\bit.py
298
299
300
301
302
303
304
305
306
307
308
309
@register_model
def BiT_resnet101(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs):
    """Build the 101-layer BiT ResNet.
    See the base class `models.BiT_Resnet` for details.
    """
    net = BiT_ResNet(Bottleneck, [3, 4, 23, 3], num_classes=num_classes, in_channels=in_channels, **kwargs)

    if pretrained:
        # Download and load the released weights for this configuration.
        load_pretrained(net, default_cfgs["BiT_resnet101"], num_classes=num_classes, in_channels=in_channels)

    return net

mindcv.models.bit.BiT_resnet50(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get 50 layers ResNet model. Refer to the base class models.BiT_ResNet for more details.

Source code in mindcv\models\bit.py
270
271
272
273
274
275
276
277
278
279
280
281
@register_model
def BiT_resnet50(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs):
    """Build the 50-layer BiT ResNet.
    See the base class `models.BiT_Resnet` for details.
    """
    net = BiT_ResNet(Bottleneck, [3, 4, 6, 3], num_classes=num_classes, in_channels=in_channels, **kwargs)

    if pretrained:
        # Download and load the released weights for this configuration.
        load_pretrained(net, default_cfgs["BiT_resnet50"], num_classes=num_classes, in_channels=in_channels)

    return net

mindcv.models.bit.BiT_resnet50x3(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get 50 layers ResNet model with a width factor of 3. Refer to the base class models.BiT_ResNet for more details.

Source code in mindcv\models\bit.py
284
285
286
287
288
289
290
291
292
293
294
295
@register_model
def BiT_resnet50x3(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs):
    """Get 50 layers ResNet model with a width factor of 3 (BiT-R50x3).
    Refer to the base class `models.BiT_Resnet` for more details.
    """
    default_cfg = default_cfgs["BiT_resnet50x3"]
    # wf=3 triples the channel width of every stage relative to BiT_resnet50.
    model = BiT_ResNet(Bottleneck, [3, 4, 6, 3], wf=3, num_classes=num_classes, in_channels=in_channels, **kwargs)

    if pretrained:
        load_pretrained(model, default_cfg, num_classes=num_classes, in_channels=in_channels)

    return model

cait

mindcv.models.cait

MindSpore implementation of CaiT. Refer to Going deeper with Image Transformers.

mindcv.models.cait.AttentionTalkingHead

Bases: Cell

Talking head is a trick for multi-head attention, which has two more linear map before and after the softmax compared to normal attention.

Source code in mindcv\models\cait.py
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
class AttentionTalkingHead(nn.Cell):
    """
    Talking head is a trick for multi-head attention,
    which has two more linear map before and after
    the softmax compared to normal attention.
    """
    def __init__(self,
                 dim: int,
                 num_heads: int = 8,
                 qkv_bias: bool = False,
                 qk_scale: float = None,
                 attn_drop_rate: float = 0.,
                 proj_drop_rate: float = 0.) -> None:
        super(AttentionTalkingHead, self).__init__()
        assert dim % num_heads == 0, "dim should be divisible by num_heads."
        self.num_heads = num_heads
        head_dim = dim // num_heads
        # Scale q by 1/sqrt(head_dim) unless an explicit qk_scale is given.
        self.scale = qk_scale or head_dim ** -0.5

        # Single projection producing q, k and v at once.
        self.qkv = nn.Dense(dim, dim * 3, has_bias=qkv_bias)
        self.attn_drop = Dropout(p=attn_drop_rate)

        self.proj = nn.Dense(dim, dim, has_bias=False)

        # "Talking-head" linear maps that mix attention heads before (proj_l)
        # and after (proj_w) the softmax.
        self.proj_l = nn.Dense(num_heads, num_heads, has_bias=False)
        self.proj_w = nn.Dense(num_heads, num_heads, has_bias=False)

        self.proj_drop = Dropout(p=proj_drop_rate)

        self.softmax = nn.Softmax(axis=-1)

        self.attn_matmul_v = ops.BatchMatMul()
        self.q_matmul_k = ops.BatchMatMul(transpose_b=True)

    def construct(self, x) -> Tensor:
        B, N, C = x.shape
        # (B, N, 3C) -> (3, B, heads, N, head_dim)
        qkv = ops.reshape(self.qkv(x), (B, N, 3, self.num_heads, C // self.num_heads))
        qkv = ops.transpose(qkv, (2, 0, 3, 1, 4))
        q, k, v = ops.unstack(qkv, axis=0)
        q = ops.mul(q, self.scale)

        # Raw attention logits: (B, heads, N, N).
        attn = self.q_matmul_k(q, k)

        # Move the head axis last so a Dense layer can mix heads, then restore.
        attn = ops.transpose(attn, (0, 2, 3, 1))
        attn = self.proj_l(attn)
        attn = ops.transpose(attn, (0, 3, 1, 2))
        attn = self.softmax(attn)
        # Mix heads again after the softmax.
        attn = ops.transpose(attn, (0, 2, 3, 1))
        attn = self.proj_w(attn)
        attn = ops.transpose(attn, (0, 3, 1, 2))

        attn = self.attn_drop(attn)

        # Weighted sum of values, then merge heads back to (B, N, C).
        x = self.attn_matmul_v(attn, v)
        x = ops.transpose(x, (0, 2, 1, 3))
        x = ops.reshape(x, (B, N, C))
        x = self.proj(x)
        x = self.proj_drop(x)

        return x

cmt

mindcv.models.cmt

mindcv.models.cmt.PatchEmbed

Bases: Cell

Image to Patch Embedding

Source code in mindcv\models\cmt.py
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
class PatchEmbed(nn.Cell):
    """Image to Patch Embedding via a strided convolution followed by LayerNorm."""

    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)

        assert img_size[0] % patch_size[0] == 0 and img_size[1] % patch_size[1] == 0, \
            f"img_size {img_size} should be divided by patch_size {patch_size}."

        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])

        # Non-overlapping patches: kernel and stride both equal the patch size.
        self.proj = nn.Conv2d(
            in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, has_bias=True)
        self.norm = nn.LayerNorm([embed_dim])

    def construct(self, x):
        _, _, H, W = x.shape

        assert H == self.img_size[0] and W == self.img_size[1], \
            f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."

        feat = self.proj(x)
        B, C, Hp, Wp = feat.shape
        # Flatten the spatial grid and move channels last: (B, Hp*Wp, C).
        feat = ops.transpose(feat.reshape(B, C, Hp * Wp), (0, 2, 1))
        feat = self.norm(feat)

        # Also return the patch-grid resolution.
        return feat, (H // self.patch_size[0], W // self.patch_size[1])

mindcv.models.cmt.cmt_base(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

CMT-Base

Source code in mindcv\models\cmt.py
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
@register_model
def cmt_base(pretrained=False, num_classes: int = 1000, in_channels: int = 3, **kwargs):
    """Build the CMT-Base variant (256x256 input, depths [4, 4, 20, 4])."""
    net = CMT(img_size=256, num_classes=num_classes, in_channels=in_channels, qkv_bias=True,
              embed_dims=[76, 152, 304, 608], stem_channel=38, num_heads=[1, 2, 4, 8], depths=[4, 4, 20, 4],
              mlp_ratios=[4, 4, 4, 4], qk_ratio=1, sr_ratios=[8, 4, 2, 1], **kwargs)

    if pretrained:
        # Download and load the released weights for this configuration.
        load_pretrained(net, default_cfgs["cmt_base"], num_classes=num_classes, in_channels=in_channels)

    return net

mindcv.models.cmt.cmt_small(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

CMT-Small

Source code in mindcv\models\cmt.py
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
@register_model
def cmt_small(pretrained=False, num_classes: int = 1000, in_channels: int = 3, **kwargs):
    """Build the CMT-Small variant (224x224 input, depths [3, 3, 16, 3])."""
    net = CMT(img_size=224, num_classes=num_classes, in_channels=in_channels, qkv_bias=True,
              embed_dims=[64, 128, 256, 512], stem_channel=32, num_heads=[1, 2, 4, 8], depths=[3, 3, 16, 3],
              mlp_ratios=[4, 4, 4, 4], qk_ratio=1, sr_ratios=[8, 4, 2, 1], **kwargs)

    if pretrained:
        # Download and load the released weights for this configuration.
        load_pretrained(net, default_cfgs["cmt_small"], num_classes=num_classes, in_channels=in_channels)

    return net

mindcv.models.cmt.cmt_tiny(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

CMT-tiny

Source code in mindcv\models\cmt.py
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
@register_model
def cmt_tiny(pretrained=False, num_classes: int = 1000, in_channels: int = 3, **kwargs):
    """Build the CMT-Tiny variant (160x160 input, depths [2, 2, 10, 2])."""
    net = CMT(img_size=160, num_classes=num_classes, in_channels=in_channels, qkv_bias=True,
              embed_dims=[46, 92, 184, 368], stem_channel=16, num_heads=[1, 2, 4, 8], depths=[2, 2, 10, 2],
              mlp_ratios=[3.6, 3.6, 3.6, 3.6], qk_ratio=1, sr_ratios=[8, 4, 2, 1], **kwargs)

    if pretrained:
        # Download and load the released weights for this configuration.
        load_pretrained(net, default_cfgs["cmt_tiny"], num_classes=num_classes, in_channels=in_channels)

    return net

mindcv.models.cmt.cmt_xsmall(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

CMT-XSmall

Source code in mindcv\models\cmt.py
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
@register_model
def cmt_xsmall(pretrained=False, num_classes: int = 1000, in_channels: int = 3, **kwargs):
    """Build the CMT-XSmall variant (192x192 input, depths [3, 3, 12, 3])."""
    net = CMT(img_size=192, num_classes=num_classes, in_channels=in_channels, qkv_bias=True,
              embed_dims=[52, 104, 208, 416], stem_channel=16, num_heads=[1, 2, 4, 8], depths=[3, 3, 12, 3],
              mlp_ratios=[3.8, 3.8, 3.8, 3.8], qk_ratio=1, sr_ratios=[8, 4, 2, 1], **kwargs)

    if pretrained:
        # Download and load the released weights for this configuration.
        load_pretrained(net, default_cfgs["cmt_xsmall"], num_classes=num_classes, in_channels=in_channels)

    return net

coat

mindcv.models.coat

CoaT architecture. Modified from timm/models/vision_transformer.py

mindcv.models.coat.CoaT

Bases: Cell

CoaT class.

Source code in mindcv\models\coat.py
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
class CoaT(nn.Cell):
    """CoaT: Co-scale conv-attentional image transformer.

    Four serial stages progressively downsample the input; an optional stack of
    parallel blocks then co-processes the three finest token streams. Depending
    on `return_interm_layers`, the network yields either intermediate feature
    maps (for dense prediction) or a single classification feature.
    """

    def __init__(
        self,
        image_size=224,
        patch_size=16,
        in_chans=3,
        num_classes=1000,
        # NOTE(review): mutable default arguments below (lists/dict). They appear
        # to be read-only here, but replacing them with None-sentinels would be
        # safer — confirm before refactoring.
        embed_dims=[0, 0, 0, 0],
        serial_depths=[0, 0, 0, 0],
        parallel_depth=0,
        num_heads=0,
        mlp_ratios=[0, 0, 0, 0],
        qkv_bias=True,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.,
        return_interm_layers=False,
        out_features=None,
        crpe_window={3: 2, 5: 3, 7: 3},
        **kwargs
    ) -> None:
        super().__init__()
        self.return_interm_layers = return_interm_layers
        self.out_features = out_features
        self.num_classes = num_classes

        # Patch embeddings: stage 1 uses `patch_size`, later stages halve the
        # resolution with patch_size=2 each.
        self.patch_embed1 = PatchEmbed(image_size=image_size, patch_size=patch_size,
                                       in_chans=in_chans, embed_dim=embed_dims[0])
        self.patch_embed2 = PatchEmbed(image_size=image_size // (2**2), patch_size=2,
                                       in_chans=embed_dims[0], embed_dim=embed_dims[1])
        self.patch_embed3 = PatchEmbed(image_size=image_size // (2**3), patch_size=2,
                                       in_chans=embed_dims[1], embed_dim=embed_dims[2])
        self.patch_embed4 = PatchEmbed(image_size=image_size // (2**4), patch_size=2,
                                       in_chans=embed_dims[2], embed_dim=embed_dims[3])

        # One learnable class token per stage, zero-initialized here and
        # truncated-normal initialized at the end of __init__.
        self.cls_token1 = mindspore.Parameter(ops.Zeros()((1, 1, embed_dims[0]), mindspore.float32))
        self.cls_token2 = mindspore.Parameter(ops.Zeros()((1, 1, embed_dims[1]), mindspore.float32))
        self.cls_token3 = mindspore.Parameter(ops.Zeros()((1, 1, embed_dims[2]), mindspore.float32))
        self.cls_token4 = mindspore.Parameter(ops.Zeros()((1, 1, embed_dims[3]), mindspore.float32))

        # Convolutional position encodings, shared across each stage's blocks.
        self.cpe1 = ConvPosEnc(dim=embed_dims[0], k=3)
        self.cpe2 = ConvPosEnc(dim=embed_dims[1], k=3)
        self.cpe3 = ConvPosEnc(dim=embed_dims[2], k=3)
        self.cpe4 = ConvPosEnc(dim=embed_dims[3], k=3)

        # Convolutional relative position encodings, also shared per stage.
        self.crpe1 = ConvRelPosEnc(Ch=embed_dims[0] // num_heads, h=num_heads, window=crpe_window)
        self.crpe2 = ConvRelPosEnc(Ch=embed_dims[1] // num_heads, h=num_heads, window=crpe_window)
        self.crpe3 = ConvRelPosEnc(Ch=embed_dims[2] // num_heads, h=num_heads, window=crpe_window)
        self.crpe4 = ConvRelPosEnc(Ch=embed_dims[3] // num_heads, h=num_heads, window=crpe_window)

        # A single drop-path rate is used for every block (no per-depth schedule).
        dpr = drop_path_rate

        self.serial_blocks1 = nn.CellList([
            SerialBlock(
                dim=embed_dims[0], num_heads=num_heads, mlp_ratio=mlp_ratios[0], qkv_bias=qkv_bias,
                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr,
                shared_cpe=self.cpe1, shared_crpe=self.crpe1
            )
            for _ in range(serial_depths[0])]
        )

        self.serial_blocks2 = nn.CellList([
            SerialBlock(
                dim=embed_dims[1], num_heads=num_heads, mlp_ratio=mlp_ratios[1], qkv_bias=qkv_bias,
                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr,
                shared_cpe=self.cpe2, shared_crpe=self.crpe2
            )
            for _ in range(serial_depths[1])]
        )

        self.serial_blocks3 = nn.CellList([
            SerialBlock(
                dim=embed_dims[2], num_heads=num_heads, mlp_ratio=mlp_ratios[2], qkv_bias=qkv_bias,
                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr,
                shared_cpe=self.cpe3, shared_crpe=self.crpe3
            )
            for _ in range(serial_depths[2])]
        )

        self.serial_blocks4 = nn.CellList([
            SerialBlock(
                dim=embed_dims[3], num_heads=num_heads, mlp_ratio=mlp_ratios[3], qkv_bias=qkv_bias,
                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr,
                shared_cpe=self.cpe4, shared_crpe=self.crpe4
            )
            for _ in range(serial_depths[3])]
        )

        # Parallel blocks jointly refine the streams of stages 2-4 (plus stage 1
        # tokens); absent entirely in the "lite" configurations.
        self.parallel_depth = parallel_depth
        if self.parallel_depth > 0:
            self.parallel_blocks = nn.CellList([
                ParallelBlock(dims=embed_dims,
                              num_heads=num_heads,
                              mlp_ratios=mlp_ratios,
                              qkv_bias=qkv_bias,
                              drop=drop_rate,
                              attn_drop=attn_drop_rate,
                              drop_path=dpr,
                              shared_cpes=[self.cpe1, self.cpe2, self.cpe3, self.cpe4],
                              shared_crpes=[self.crpe1, self.crpe2, self.crpe3, self.crpe4]
                              )
                for _ in range(parallel_depth)]
            )
        else:
            self.parallel_blocks = None

        # Classification head; skipped entirely when intermediate layers are
        # requested (dense-prediction use).
        if not self.return_interm_layers:
            if self.parallel_blocks is not None:
                self.norm2 = nn.LayerNorm((embed_dims[1],), epsilon=1e-6)
                self.norm3 = nn.LayerNorm((embed_dims[2],), epsilon=1e-6)
            else:
                self.norm2 = None
                self.norm3 = None

            self.norm4 = nn.LayerNorm((embed_dims[3],), epsilon=1e-6)

            if self.parallel_depth > 0:
                # 1x1 Conv1d aggregates the three class tokens into one vector.
                self.aggregate = nn.Conv1d(in_channels=3,
                                           out_channels=1,
                                           kernel_size=1,
                                           has_bias=True)
                self.head = nn.Dense(embed_dims[3], num_classes) if num_classes > 0 else Identity()
            else:
                self.aggregate = None
                self.head = nn.Dense(embed_dims[3], num_classes) if num_classes > 0 else Identity()

        self.cls_token1.set_data(init.initializer(init.TruncatedNormal(sigma=.02), self.cls_token1.data.shape))
        self.cls_token2.set_data(init.initializer(init.TruncatedNormal(sigma=.02), self.cls_token2.data.shape))
        self.cls_token3.set_data(init.initializer(init.TruncatedNormal(sigma=.02), self.cls_token3.data.shape))
        self.cls_token4.set_data(init.initializer(init.TruncatedNormal(sigma=.02), self.cls_token4.data.shape))
        self._initialize_weights()

    def _initialize_weights(self) -> None:
        """Truncated-normal init for Dense weights; unit/zero init for LayerNorm."""
        for _, cell in self.cells_and_names():
            if isinstance(cell, nn.Dense):
                cell.weight.set_data(init.initializer(init.TruncatedNormal(sigma=.02), cell.weight.data.shape))
                if cell.bias is not None:
                    cell.bias.set_data(init.initializer(init.Constant(0), cell.bias.shape))
            elif isinstance(cell, nn.LayerNorm):
                cell.gamma.set_data(init.initializer(init.Constant(1.0), cell.gamma.shape))
                cell.beta.set_data(init.initializer(init.Constant(0), cell.beta.shape))

    def insert_cls(self, x, cls_token) -> Tensor:
        """Prepend the (broadcast) class token to the patch-token sequence."""
        t0 = x.shape[0]
        t1 = cls_token.shape[1]
        t2 = cls_token.shape[2]
        # Broadcast the (1, 1, C) token to the batch via expand_as on a dummy.
        y = Tensor(np.ones((t0, t1, t2)))
        cls_tokens = cls_token.expand_as(y)

        x = ops.concat((cls_tokens, x), axis=1)
        return x

    def remove_cls(self, x: Tensor) -> Tensor:
        """Drop the leading class token, keeping only patch tokens."""
        return x[:, 1:, :]

    def forward_features(self, x0: Tensor) -> Union[dict, Tensor]:
        """Run the serial (and optional parallel) stages.

        Returns a dict of intermediate feature maps when
        `return_interm_layers` is set, otherwise the classification feature.
        """
        B = x0.shape[0]

        # Stage 1: embed, prepend cls token, run serial blocks, then reshape
        # the patch tokens back to a (B, C, H, W) map for the next stage.
        x1 = self.patch_embed1(x0)
        H1, W1 = self.patch_embed1.patches_resolution
        x1 = self.insert_cls(x1, self.cls_token1)
        for blk in self.serial_blocks1:
            x1 = blk(x1, size=(H1, W1))
        x1_nocls = self.remove_cls(x1)
        x1_nocls = ops.reshape(x1_nocls, (B, H1, W1, -1))
        x1_nocls = ops.transpose(x1_nocls, (0, 3, 1, 2))

        # Stage 2 (same pattern, half the resolution).
        x2 = self.patch_embed2(x1_nocls)
        H2, W2 = self.patch_embed2.patches_resolution
        x2 = self.insert_cls(x2, self.cls_token2)
        for blk in self.serial_blocks2:
            x2 = blk(x2, size=(H2, W2))
        x2_nocls = self.remove_cls(x2)
        x2_nocls = ops.reshape(x2_nocls, (B, H2, W2, -1))
        x2_nocls = ops.transpose(x2_nocls, (0, 3, 1, 2))

        # Stage 3.
        x3 = self.patch_embed3(x2_nocls)
        H3, W3 = self.patch_embed3.patches_resolution
        x3 = self.insert_cls(x3, self.cls_token3)
        for blk in self.serial_blocks3:
            x3 = blk(x3, size=(H3, W3))
        x3_nocls = self.remove_cls(x3)
        x3_nocls = ops.reshape(x3_nocls, (B, H3, W3, -1))
        x3_nocls = ops.transpose(x3_nocls, (0, 3, 1, 2))

        # Stage 4.
        x4 = self.patch_embed4(x3_nocls)
        H4, W4 = self.patch_embed4.patches_resolution
        x4 = self.insert_cls(x4, self.cls_token4)
        for blk in self.serial_blocks4:
            x4 = blk(x4, size=(H4, W4))
        x4_nocls = self.remove_cls(x4)
        x4_nocls = ops.reshape(x4_nocls, (B, H4, W4, -1))
        x4_nocls = ops.transpose(x4_nocls, (0, 3, 1, 2))

        # Serial-only ("lite") path: return features or the stage-4 cls token.
        if self.parallel_depth == 0:
            if self.return_interm_layers:
                feat_out = {}
                if 'x1_nocls' in self.out_features:
                    feat_out['x1_nocls'] = x1_nocls
                if 'x2_nocls' in self.out_features:
                    feat_out['x2_nocls'] = x2_nocls
                if 'x3_nocls' in self.out_features:
                    feat_out['x3_nocls'] = x3_nocls
                if 'x4_nocls' in self.out_features:
                    feat_out['x4_nocls'] = x4_nocls
                return feat_out
            else:
                x4 = self.norm4(x4)
                x4_cls = x4[:, 0]
                return x4_cls

        # Parallel blocks co-process all four token streams.
        for blk in self.parallel_blocks:
            x1, x2, x3, x4 = blk(x1, x2, x3, x4, sizes=[(H1, W1), (H2, W2), (H3, W3), (H4, W4)])

        if self.return_interm_layers:
            feat_out = {}
            if 'x1_nocls' in self.out_features:
                x1_nocls = x1[:, 1:, :].reshape((B, H1, W1, -1)).transpose((0, 3, 1, 2))
                feat_out['x1_nocls'] = x1_nocls
            if 'x2_nocls' in self.out_features:
                x2_nocls = x2[:, 1:, :].reshape((B, H2, W2, -1)).transpose((0, 3, 1, 2))
                feat_out['x2_nocls'] = x2_nocls
            if 'x3_nocls' in self.out_features:
                x3_nocls = x3[:, 1:, :].reshape((B, H3, W3, -1)).transpose((0, 3, 1, 2))
                feat_out['x3_nocls'] = x3_nocls
            if 'x4_nocls' in self.out_features:
                x4_nocls = x4[:, 1:, :].reshape((B, H4, W4, -1)).transpose((0, 3, 1, 2))
                feat_out['x4_nocls'] = x4_nocls
            return feat_out
        else:
            # Aggregate the class tokens of stages 2-4 into one feature vector.
            x2 = self.norm2(x2)
            x3 = self.norm3(x3)
            x4 = self.norm4(x4)
            x2_cls = x2[:, :1]
            x3_cls = x3[:, :1]
            x4_cls = x4[:, :1]
            merged_cls = ops.concat((x2_cls, x3_cls, x4_cls), axis=1)
            merged_cls = self.aggregate(merged_cls).squeeze(axis=1)
            return merged_cls

    def construct(self, x: Tensor) -> Union[dict, Tensor]:
        if self.return_interm_layers:
            return self.forward_features(x)
        else:
            x = self.forward_features(x)
            x = self.head(x)
            return x

mindcv.models.coat.ConvPosEnc

Bases: Cell

Convolutional Position Encoding. Note: This module is similar to the conditional position encoding in CPVT.

Source code in mindcv\models\coat.py
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
class ConvPosEnc(nn.Cell):
    """Convolutional Position Encoding.

    Injects positional information into the image tokens via a depth-wise
    convolution with a residual connection, similar to the conditional
    position encoding in CPVT. The class token is left untouched.
    """

    def __init__(
        self,
        dim,
        k=3
    ) -> None:
        super(ConvPosEnc, self).__init__()
        # Depth-wise conv (group=dim): each channel is encoded independently.
        self.proj = nn.Conv2d(
            in_channels=dim,
            out_channels=dim,
            kernel_size=k,
            stride=1,
            padding=k // 2,
            group=dim,
            pad_mode='pad',
            has_bias=True,
        )

    def construct(self, x, size) -> Tensor:
        B, _, C = x.shape
        H, W = size

        # Split off the class token; only image tokens receive the encoding.
        cls_token = x[:, :1]
        img_tokens = x[:, 1:]

        # (B, H*W, C) -> (B, C, H, W) for the 2D convolution.
        feat = ops.reshape(ops.transpose(img_tokens, (0, 2, 1)), (B, C, H, W))
        # Residual connection around the depth-wise projection.
        encoded = ops.add(self.proj(feat), feat)

        # Back to token layout (B, H*W, C).
        encoded = ops.transpose(ops.reshape(encoded, (B, C, H * W)), (0, 2, 1))

        return ops.concat((cls_token, encoded), axis=1)

mindcv.models.coat.FactorAtt_ConvRelPosEnc

Bases: Cell

Factorized attention with convolutional relative position encoding class.

Source code in mindcv\models\coat.py
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
class FactorAtt_ConvRelPosEnc(nn.Cell):
    """Factorized attention with convolutional relative position encoding.

    Instead of the quadratic softmax(Q K^T) V, factorized attention first
    softmax-normalizes the keys over the token axis and computes K^T V (a
    small d x d matrix per head), then multiplies by Q. The result is
    mathematically the same product, but the cost is linear in the number of
    tokens. A shared convolutional relative position encoding (crpe) term is
    added on top.

    Args:
        dim: total embedding dimension (split across heads).
        num_heads: number of attention heads. Default: 8.
        qkv_bias: add bias to the q/k/v projections. Default: False.
        attn_drop: dropout rate on attention (kept for interface parity).
        proj_drop: dropout rate after the output projection.
        shared_crpe: shared conv relative position encoding cell.
    """

    def __init__(
        self,
        dim,
        num_heads=8,
        qkv_bias=False,
        attn_drop=0.,
        proj_drop=0.,
        shared_crpe=None
    ) -> None:
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim ** -0.5

        self.q = nn.Dense(in_channels=dim, out_channels=dim, has_bias=qkv_bias)
        self.k = nn.Dense(in_channels=dim, out_channels=dim, has_bias=qkv_bias)
        self.v = nn.Dense(in_channels=dim, out_channels=dim, has_bias=qkv_bias)
        self.attn_drop = Dropout(p=attn_drop)
        self.proj = nn.Dense(dim, dim)
        self.proj_drop = Dropout(p=proj_drop)
        self.softmax = nn.Softmax(axis=-1)
        self.batch_matmul = ops.BatchMatMul()

        # Conv relative position encoding, shared across blocks of a stage.
        self.crpe = shared_crpe

    def construct(self, x, size) -> Tensor:
        B, N, C = x.shape
        # Project and reshape into multi-head layout.
        q = ops.reshape(self.q(x), (B, N, self.num_heads, C // self.num_heads))
        q = ops.transpose(q, (0, 2, 1, 3))  # (B, h, N, d)
        k = ops.reshape(self.k(x), (B, N, self.num_heads, C // self.num_heads))
        k = ops.transpose(k, (0, 2, 3, 1))  # (B, h, d, N)
        v = ops.reshape(self.v(x), (B, N, self.num_heads, C // self.num_heads))
        v = ops.transpose(v, (0, 2, 1, 3))  # (B, h, N, d)

        # Softmax over the token axis of the keys.
        k_softmax = self.softmax(k)
        # Factorized attention: compute (K_softmax V) first, giving a (d, d)
        # matrix per head, then multiply by Q. This is associativity-equivalent
        # to the previous (Q K_softmax) V ordering but linear in N instead of
        # quadratic, matching the CoaT design.
        factor_att = self.batch_matmul(k_softmax, v)   # (B, h, d, d)
        factor_att = self.batch_matmul(q, factor_att)  # (B, h, N, d)

        # Convolutional relative position encoding on (q, v).
        crpe = self.crpe(q, v, size=size)

        # Scaled factorized attention plus positional term.
        x = ops.mul(self.scale, factor_att)
        x = ops.add(x, crpe)
        x = ops.transpose(x, (0, 2, 1, 3))
        x = ops.reshape(x, (B, N, C))

        x = self.proj(x)
        x = self.proj_drop(x)
        return x

mindcv.models.coat.Mlp

Bases: Cell

MLP Cell

Source code in mindcv\models\coat.py
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
class Mlp(nn.Cell):
    """Feed-forward network: Dense -> GELU -> Dropout -> Dense -> Dropout."""

    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        drop=0.0
    ) -> None:
        super().__init__()
        # Fall back to the input width when hidden/output widths are omitted.
        hidden_features = hidden_features or in_features
        out_features = out_features or in_features
        self.fc1 = nn.Dense(in_channels=in_features, out_channels=hidden_features, has_bias=True)
        self.act = nn.GELU(approximate=False)
        self.fc2 = nn.Dense(in_channels=hidden_features, out_channels=out_features, has_bias=True)
        self.drop = Dropout(p=drop)

    def construct(self, x: Tensor) -> Tensor:
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))

mindcv.models.coat.ParallelBlock

Bases: Cell

Parallel block class.

Source code in mindcv\models\coat.py
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
class ParallelBlock(nn.Cell):
    """ Parallel block class.

    Runs conv-attention on scales 2-4 in parallel, exchanges information
    between the scales via bilinear up-/down-sampling of their feature maps,
    then applies a (shared) MLP. Scale 1 (x1) is passed through unchanged.
    """

    def __init__(
        self,
        dims,
        num_heads,
        mlp_ratios=[],
        qkv_bias=False,
        drop=0.,
        attn_drop=0.,
        drop_path=0.,
        shared_cpes=None,
        shared_crpes=None
    ) -> None:
        # NOTE(review): `mlp_ratios=[]` is a mutable default argument; safe here
        # only because it is never mutated.
        super().__init__()

        # Shared convolutional position encodings, one per scale.
        self.cpes = shared_cpes

        # Pre-attention layer norms for scales 2-4.
        self.norm12 = nn.LayerNorm((dims[1],), epsilon=1e-6)
        self.norm13 = nn.LayerNorm((dims[2],), epsilon=1e-6)
        self.norm14 = nn.LayerNorm((dims[3],), epsilon=1e-6)
        self.factoratt_crpe2 = FactorAtt_ConvRelPosEnc(dims[1],
                                                       num_heads=num_heads,
                                                       qkv_bias=qkv_bias,
                                                       attn_drop=attn_drop,
                                                       proj_drop=drop,
                                                       shared_crpe=shared_crpes[1]
                                                       )
        self.factoratt_crpe3 = FactorAtt_ConvRelPosEnc(dims[2],
                                                       num_heads=num_heads,
                                                       qkv_bias=qkv_bias,
                                                       attn_drop=attn_drop,
                                                       proj_drop=drop,
                                                       shared_crpe=shared_crpes[2]
                                                       )
        self.factoratt_crpe4 = FactorAtt_ConvRelPosEnc(dims[3],
                                                       num_heads=num_heads,
                                                       qkv_bias=qkv_bias,
                                                       attn_drop=attn_drop,
                                                       proj_drop=drop,
                                                       shared_crpe=shared_crpes[3]
                                                       )
        self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
        # Bilinear interpolation used for both up- and down-sampling.
        self.interpolate_fn = Interpolate(mode="bilinear", align_corners=True)

        # Pre-MLP layer norms for scales 2-4.
        self.norm22 = nn.LayerNorm((dims[1],), epsilon=1e-6)
        self.norm23 = nn.LayerNorm((dims[2],), epsilon=1e-6)
        self.norm24 = nn.LayerNorm((dims[3],), epsilon=1e-6)

        # A single MLP instance is shared by all three scales; this assumes
        # dims[1] == dims[2] == dims[3] and matching mlp_ratios (as in the
        # reference CoaT implementation) — TODO confirm at call sites.
        mlp_hidden_dim = int(dims[1] * mlp_ratios[1])
        self.mlp2 = self.mlp3 = self.mlp4 = Mlp(in_features=dims[1], hidden_features=mlp_hidden_dim, drop=drop)

    def upsample(self, x, output_size, size) -> Tensor:
        """ Feature map up-sampling. """
        return self.interpolate(x, output_size=output_size, size=size)

    def downsample(self, x, output_size, size) -> Tensor:
        """ Feature map down-sampling. """
        return self.interpolate(x, output_size=output_size, size=size)

    def interpolate(self, x, output_size, size) -> Tensor:
        """ Feature map interpolation.

        Resizes the image-token portion of a token sequence from `size` (H, W)
        to `output_size`; the class token is passed through unchanged.
        """
        B, N, C = x.shape
        H, W = size

        # Separate the class token from the image tokens.
        cls_token = x[:, :1, :]
        img_tokens = x[:, 1:, :]

        # Tokens -> 2D map, resize, then back to token layout.
        img_tokens = ops.transpose(img_tokens, (0, 2, 1))
        img_tokens = ops.reshape(img_tokens, (B, C, H, W))
        img_tokens = self.interpolate_fn(img_tokens, size=output_size)
        img_tokens = ops.reshape(img_tokens, (B, C, -1))
        img_tokens = ops.transpose(img_tokens, (0, 2, 1))

        out = ops.concat((cls_token, img_tokens), axis=1)
        return out

    def construct(self, x1, x2, x3, x4, sizes) -> tuple:
        _, (H2, W2), (H3, W3), (H4, W4) = sizes

        # Conv-Attention.
        x2 = self.cpes[1](x2, size=(H2, W2))  # Note: x1 is ignored.
        x3 = self.cpes[2](x3, size=(H3, W3))
        x4 = self.cpes[3](x4, size=(H4, W4))

        # Pre-norm factorized attention on each scale.
        cur2 = self.norm12(x2)
        cur3 = self.norm13(x3)
        cur4 = self.norm14(x4)
        cur2 = self.factoratt_crpe2(cur2, size=(H2, W2))
        cur3 = self.factoratt_crpe3(cur3, size=(H3, W3))
        cur4 = self.factoratt_crpe4(cur4, size=(H4, W4))
        # Cross-scale fusion: every scale receives the other two, resampled
        # to its own resolution, before the residual update.
        upsample3_2 = self.upsample(cur3, output_size=(H2, W2), size=(H3, W3))
        upsample4_3 = self.upsample(cur4, output_size=(H3, W3), size=(H4, W4))
        upsample4_2 = self.upsample(cur4, output_size=(H2, W2), size=(H4, W4))
        downsample2_3 = self.downsample(cur2, output_size=(H3, W3), size=(H2, W2))
        downsample3_4 = self.downsample(cur3, output_size=(H4, W4), size=(H3, W3))
        downsample2_4 = self.downsample(cur2, output_size=(H4, W4), size=(H2, W2))
        cur2 = cur2 + upsample3_2 + upsample4_2
        cur3 = cur3 + upsample4_3 + downsample2_3
        cur4 = cur4 + downsample3_4 + downsample2_4
        x2 = x2 + self.drop_path(cur2)
        x3 = x3 + self.drop_path(cur3)
        x4 = x4 + self.drop_path(cur4)

        # MLP branch (pre-norm, residual, shared weights across scales).
        cur2 = self.norm22(x2)
        cur3 = self.norm23(x3)
        cur4 = self.norm24(x4)
        cur2 = self.mlp2(cur2)
        cur3 = self.mlp3(cur3)
        cur4 = self.mlp4(cur4)
        x2 = x2 + self.drop_path(cur2)
        x3 = x3 + self.drop_path(cur3)
        x4 = x4 + self.drop_path(cur4)

        # x1 is returned untouched.
        return x1, x2, x3, x4
mindcv.models.coat.ParallelBlock.downsample(x, output_size, size)

Feature map down-sampling.

Source code in mindcv\models\coat.py
339
340
341
def downsample(self, x, output_size, size) -> Tensor:
    """ Feature map down-sampling. """
    # Thin wrapper: the actual resizing is delegated to `interpolate`;
    # the down-sampling direction comes from output_size being smaller than size.
    return self.interpolate(x, output_size=output_size, size=size)
mindcv.models.coat.ParallelBlock.interpolate(x, output_size, size)

Feature map interpolation.

Source code in mindcv\models\coat.py
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
def interpolate(self, x, output_size, size) -> Tensor:
    """ Feature map interpolation.

    Resizes the image-token portion of a token sequence from `size` (H, W)
    to `output_size`; the class token is passed through unchanged.
    """
    B, N, C = x.shape
    H, W = size

    # Separate the class token from the image tokens.
    cls_token = x[:, :1, :]
    img_tokens = x[:, 1:, :]

    # Tokens -> 2D map for spatial resizing.
    img_tokens = ops.transpose(img_tokens, (0, 2, 1))
    img_tokens = ops.reshape(img_tokens, (B, C, H, W))
    img_tokens = self.interpolate_fn(img_tokens, size=output_size)
    # Back to token layout (B, out_H*out_W, C).
    img_tokens = ops.reshape(img_tokens, (B, C, -1))
    img_tokens = ops.transpose(img_tokens, (0, 2, 1))

    out = ops.concat((cls_token, img_tokens), axis=1)
    return out
mindcv.models.coat.ParallelBlock.upsample(x, output_size, size)

Feature map up-sampling.

Source code in mindcv\models\coat.py
335
336
337
def upsample(self, x, output_size, size) -> Tensor:
    """ Feature map up-sampling. """
    # Thin wrapper: the actual resizing is delegated to `interpolate`;
    # the up-sampling direction comes from output_size being larger than size.
    return self.interpolate(x, output_size=output_size, size=size)

mindcv.models.coat.PatchEmbed

Bases: Cell

Image to Patch Embedding

Source code in mindcv\models\coat.py
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
class PatchEmbed(nn.Cell):
    """ Image to Patch Embedding.

    Splits the image into non-overlapping patches with a strided convolution
    (kernel == stride == patch size), flattens them into a token sequence and
    applies layer normalization.
    """

    def __init__(
        self,
        image_size=224,
        patch_size=4,
        in_chans=3,
        embed_dim=96
    ) -> None:
        super().__init__()
        image_size = (image_size, image_size)
        patch_size = (patch_size, patch_size)
        grid = [image_size[0] // patch_size[0], image_size[1] // patch_size[1]]

        self.image_size = image_size
        self.patch_size = patch_size
        self.patches_resolution = grid
        self.num_patches = grid[0] * grid[1]
        self.in_chans = in_chans
        self.embed_dim = embed_dim

        # Patch projection: one conv step per patch, no padding.
        self.proj = nn.Conv2d(
            in_channels=in_chans,
            out_channels=embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
            pad_mode='valid',
            has_bias=True,
        )
        self.norm = nn.LayerNorm((embed_dim,), epsilon=1e-5)

    def construct(self, x: Tensor) -> Tensor:
        batch = x.shape[0]
        # (B, C, H, W) -> (B, embed_dim, num_patches) -> (B, num_patches, embed_dim)
        tokens = ops.reshape(self.proj(x), (batch, self.embed_dim, -1))
        tokens = ops.transpose(tokens, (0, 2, 1))
        return self.norm(tokens)

mindcv.models.coat.SerialBlock

Bases: Cell

Serial block class. Note: In this implementation, each serial block only contains a conv-attention and a FFN (MLP) module.

Source code in mindcv\models\coat.py
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
class SerialBlock(nn.Cell):
    """
    Serial block class.
        Note: each serial block only contains a conv-attention and a FFN (MLP)
        module, each wrapped in a pre-norm residual branch with drop-path.
    """

    def __init__(
        self,
        dim,
        num_heads,
        mlp_ratio=4.,
        qkv_bias=False,
        drop=0.,
        attn_drop=0.,
        drop_path=0.,
        shared_cpe=None,
        shared_crpe=None
    ) -> None:
        super().__init__()

        # Convolutional position encoding shared across blocks of a stage.
        self.cpe = shared_cpe

        self.norm1 = nn.LayerNorm((dim,), epsilon=1e-6)
        self.factoratt_crpe = FactorAtt_ConvRelPosEnc(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            attn_drop=attn_drop,
            proj_drop=drop,
            shared_crpe=shared_crpe,
        )
        self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()

        self.norm2 = nn.LayerNorm((dim,), epsilon=1e-6)
        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), drop=drop)

    def construct(self, x, size) -> Tensor:
        # Conv-attention branch (position-encode, pre-norm, residual).
        attn_in = self.norm1(self.cpe(x, size))
        x = x + self.drop_path(self.factoratt_crpe(attn_in, size))
        # MLP branch (pre-norm, residual).
        x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x

convit

mindcv.models.convit

MindSpore implementation of ConViT. Refer to ConViT: Improving Vision Transformers with Soft Convolutional Inductive Biases

mindcv.models.convit.Block

Bases: Cell

Basic module of ConViT

Source code in mindcv\models\convit.py
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
class Block(nn.Cell):
    """Basic module of ConViT: attention (GPSA or MHSA) followed by an MLP,
    each in a pre-norm residual branch with stochastic depth."""

    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float,
        qkv_bias: bool = False,
        drop: float = 0.0,
        attn_drop: float = 0.0,
        drop_path: float = 0.0,
        use_gpsa: bool = True,
        **kwargs
    ) -> None:
        super().__init__()

        self.norm1 = nn.LayerNorm((dim,))
        # Early layers use gated positional self-attention (GPSA); later
        # layers fall back to vanilla multi-head self-attention (MHSA).
        attn_cls = GPSA if use_gpsa else MHSA
        self.attn = attn_cls(dim, num_heads=num_heads, qkv_bias=qkv_bias,
                             attn_drop=attn_drop, proj_drop=drop, **kwargs)
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity()
        self.norm2 = nn.LayerNorm((dim,))
        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio),
                       act_layer=nn.GELU, drop=drop)

    def construct(self, x: Tensor) -> Tensor:
        x = x + self.drop_path(self.attn(self.norm1(x)))
        return x + self.drop_path(self.mlp(self.norm2(x)))

mindcv.models.convit.ConViT

Bases: Cell

ConViT model class, based on '"Improving Vision Transformers with Soft Convolutional Inductive Biases" https://arxiv.org/pdf/2103.10697.pdf'

PARAMETER DESCRIPTION
in_channels

number of input channels. Default: 3.

TYPE: int DEFAULT: 3

num_classes

number of classification classes. Default: 1000.

TYPE: int) DEFAULT: 1000

image_size

images input size. Default: 224.

TYPE: int) DEFAULT: 224

patch_size

image patch size. Default: 16.

TYPE: int) DEFAULT: 16

embed_dim

embedding dimension in all head. Default: 48.

TYPE: int) DEFAULT: 48

num_heads

number of heads. Default: 12.

TYPE: int) DEFAULT: 12

drop_rate

dropout rate. Default: 0.

TYPE: float) DEFAULT: 0.0

drop_path_rate

drop path rate. Default: 0.1.

TYPE: float) DEFAULT: 0.1

depth

model block depth. Default: 12.

TYPE: int) DEFAULT: 12

mlp_ratio

ratio of hidden features in Mlp. Default: 4.

TYPE: float) DEFAULT: 4.0

qkv_bias

have bias in qkv layers or not. Default: False.

TYPE: bool) DEFAULT: False

attn_drop_rate

attention layers dropout rate. Default: 0.

TYPE: float) DEFAULT: 0.0

locality_strength

determines how focused each head is around its attention center. Default: 1.

TYPE: float) DEFAULT: 1.0

local_up_to_layer

number of GPSA layers. Default: 10.

TYPE: int) DEFAULT: 10

use_pos_embed

whether to use the embedded position. Default: True.

TYPE: bool DEFAULT: True

locality_strength(float)

the strength of locality. Default: 1.

Source code in mindcv\models\convit.py
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
class ConViT(nn.Cell):
    r"""ConViT model class, based on
    '"Improving Vision Transformers with Soft Convolutional Inductive Biases"
    <https://arxiv.org/pdf/2103.10697.pdf>'

    Args:
        in_channels (int): number of input channels. Default: 3.
        num_classes (int): number of classification classes. Default: 1000.
        image_size (int): images input size. Default: 224.
        patch_size (int): image patch size. Default: 16.
        embed_dim (int): embedding dimension in all head. Default: 48.
        num_heads (int): number of heads. Default: 12.
        drop_rate (float): dropout rate. Default: 0.
        drop_path_rate (float): drop path rate. Default: 0.1.
        depth (int): model block depth. Default: 12.
        mlp_ratio (float): ratio of hidden features in Mlp. Default: 4.
        qkv_bias (bool): have bias in qkv layers or not. Default: False.
        attn_drop_rate (float): attention layers dropout rate. Default: 0.
        local_up_to_layer (int): number of GPSA layers. Default: 10.
        use_pos_embed (bool): whether to use the embedded position. Default: True.
        locality_strength (float): determines how focused each head is around
            its attention center. Default: 1.
    """

    def __init__(
        self,
        in_channels: int = 3,
        num_classes: int = 1000,
        image_size: int = 224,
        patch_size: int = 16,
        embed_dim: int = 48,
        num_heads: int = 12,
        drop_rate: float = 0.0,
        drop_path_rate: float = 0.1,
        depth: int = 12,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = False,
        attn_drop_rate: float = 0.0,
        local_up_to_layer: int = 10,
        use_pos_embed: bool = True,
        locality_strength: float = 1.0,
    ) -> None:
        super().__init__()

        self.local_up_to_layer = local_up_to_layer
        self.use_pos_embed = use_pos_embed
        self.num_heads = num_heads
        self.locality_strength = locality_strength
        self.embed_dim = embed_dim

        self.patch_embed = PatchEmbed(
            image_size=image_size, patch_size=patch_size, in_chans=in_channels, embed_dim=embed_dim)
        self.num_patches = self.patch_embed.num_patches

        # Learnable class token, prepended to the sequence at the first MHSA layer.
        self.cls_token = Parameter(ops.Zeros()((1, 1, embed_dim), ms.float32))
        self.pos_drop = Dropout(p=drop_rate)

        if self.use_pos_embed:
            # Learnable absolute position embedding over the patch tokens only
            # (the class token is added after the GPSA layers).
            self.pos_embed = Parameter(ops.Zeros()((1, self.num_patches, embed_dim), ms.float32))
            self.pos_embed.set_data(init.initializer(init.TruncatedNormal(sigma=0.02), self.pos_embed.data.shape))

        # Linear drop-path schedule: deeper blocks get larger drop rates.
        dpr = [x.item() for x in np.linspace(0, drop_path_rate, depth)]
        # First `local_up_to_layer` blocks use GPSA, the rest use plain MHSA.
        self.blocks = nn.CellList([
            Block(
                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias,
                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i],
                use_gpsa=True)
            if i < local_up_to_layer else
            Block(
                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias,
                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i],
                use_gpsa=False)
            for i in range(depth)])
        self.norm = nn.LayerNorm((embed_dim,))

        self.classifier = nn.Dense(in_channels=embed_dim, out_channels=num_classes) if num_classes > 0 else Identity()
        self.cls_token.set_data(init.initializer(init.TruncatedNormal(sigma=0.02), self.cls_token.data.shape))
        self._initialize_weights()

    def _initialize_weights(self) -> None:
        """Initialize Dense/LayerNorm weights, then apply GPSA "local" init."""
        for _, cell in self.cells_and_names():
            if isinstance(cell, nn.Dense):
                cell.weight.set_data(init.initializer(init.TruncatedNormal(sigma=0.02), cell.weight.data.shape))
                if cell.bias is not None:
                    cell.bias.set_data(init.initializer(init.Constant(0), cell.bias.shape))
            elif isinstance(cell, nn.LayerNorm):
                cell.gamma.set_data(init.initializer(init.Constant(1), cell.gamma.shape))
                cell.beta.set_data(init.initializer(init.Constant(0), cell.beta.shape))
        # local init
        for i in range(self.local_up_to_layer):
            # Value projection starts as identity so each GPSA layer initially
            # passes tokens through unchanged.
            self.blocks[i].attn.v.weight.set_data(ops.eye(self.embed_dim, self.embed_dim, ms.float32), slice_shape=True)
            locality_distance = 1
            # NOTE(review): assumes num_heads is a perfect square so the heads
            # tile a kernel_size x kernel_size grid of attention centers.
            kernel_size = int(self.num_heads**0.5)
            center = (kernel_size - 1) / 2 if kernel_size % 2 == 0 else kernel_size // 2
            pos_weight_data = self.blocks[i].attn.pos_proj.weight.data
            for h1 in range(kernel_size):
                for h2 in range(kernel_size):
                    # Each head h = h1 + kernel_size*h2 gets a distinct offset
                    # from the grid center, biasing it toward a local patch.
                    position = h1 + kernel_size * h2
                    pos_weight_data[position, 2] = -1
                    pos_weight_data[position, 1] = 2 * (h1 - center) * locality_distance
                    pos_weight_data[position, 0] = 2 * (h2 - center) * locality_distance
            pos_weight_data = pos_weight_data * self.locality_strength
            self.blocks[i].attn.pos_proj.weight.set_data(pos_weight_data)

    def forward_features(self, x: Tensor) -> Tensor:
        """Patch-embed the image, run all blocks, return the class-token feature."""
        x = self.patch_embed(x)
        if self.use_pos_embed:
            x = x + self.pos_embed
        x = self.pos_drop(x)
        cls_tokens = ops.tile(self.cls_token, (x.shape[0], 1, 1))
        for u, blk in enumerate(self.blocks):
            # The class token is only concatenated when switching from the
            # GPSA (patch-only) layers to the MHSA layers.
            if u == self.local_up_to_layer:
                x = ops.Cast()(x, cls_tokens.dtype)
                x = ops.concat((cls_tokens, x), 1)
            x = blk(x)
        x = self.norm(x)
        return x[:, 0]

    def forward_head(self, x: Tensor) -> Tensor:
        """Map the pooled feature to class logits."""
        x = self.classifier(x)
        return x

    def construct(self, x: Tensor) -> Tensor:
        x = self.forward_features(x)
        x = self.forward_head(x)
        return x

mindcv.models.convit.convit_base(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ConViT base model Refer to the base class "models.ConViT" for more details.

Source code in mindcv\models\convit.py
398
399
400
401
402
403
404
405
406
407
408
409
410
@register_model
def convit_base(pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs) -> ConViT:
    """Get ConViT base model (16 heads, embed_dim 768).
    Refer to the base class "models.ConViT" for more details.
    """
    model = ConViT(
        in_channels=in_channels,
        num_classes=num_classes,
        num_heads=16,
        embed_dim=768,
        **kwargs,
    )
    if pretrained:
        load_pretrained(model, default_cfgs["convit_base"],
                        num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.convit.convit_base_plus(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ConViT base+ model Refer to the base class "models.ConViT" for more details.

Source code in mindcv\models\convit.py
413
414
415
416
417
418
419
420
421
422
423
424
425
@register_model
def convit_base_plus(pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs) -> ConViT:
    """Get ConViT base+ model (16 heads, embed_dim 1024).
    Refer to the base class "models.ConViT" for more details.
    """
    model = ConViT(
        in_channels=in_channels,
        num_classes=num_classes,
        num_heads=16,
        embed_dim=1024,
        **kwargs,
    )
    if pretrained:
        load_pretrained(model, default_cfgs["convit_base_plus"],
                        num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.convit.convit_small(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ConViT small model Refer to the base class "models.ConViT" for more details.

Source code in mindcv\models\convit.py
368
369
370
371
372
373
374
375
376
377
378
379
380
@register_model
def convit_small(pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs) -> ConViT:
    """Get ConViT small model (9 heads, embed_dim 432).
    Refer to the base class "models.ConViT" for more details.
    """
    model = ConViT(
        in_channels=in_channels,
        num_classes=num_classes,
        num_heads=9,
        embed_dim=432,
        **kwargs,
    )
    if pretrained:
        load_pretrained(model, default_cfgs["convit_small"],
                        num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.convit.convit_small_plus(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ConViT small+ model Refer to the base class "models.ConViT" for more details.

Source code in mindcv\models\convit.py
383
384
385
386
387
388
389
390
391
392
393
394
395
@register_model
def convit_small_plus(pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs) -> ConViT:
    """Get ConViT small+ model (9 heads, embed_dim 576).
    Refer to the base class "models.ConViT" for more details.
    """
    model = ConViT(
        in_channels=in_channels,
        num_classes=num_classes,
        num_heads=9,
        embed_dim=576,
        **kwargs,
    )
    if pretrained:
        load_pretrained(model, default_cfgs["convit_small_plus"],
                        num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.convit.convit_tiny(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ConViT tiny model Refer to the base class "models.ConViT" for more details.

Source code in mindcv\models\convit.py
338
339
340
341
342
343
344
345
346
347
348
349
350
@register_model
def convit_tiny(pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs) -> ConViT:
    """Get ConViT tiny model (4 heads, embed_dim 192).
    Refer to the base class "models.ConViT" for more details.
    """
    model = ConViT(
        in_channels=in_channels,
        num_classes=num_classes,
        num_heads=4,
        embed_dim=192,
        **kwargs,
    )
    if pretrained:
        load_pretrained(model, default_cfgs["convit_tiny"],
                        num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.convit.convit_tiny_plus(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ConViT tiny+ model Refer to the base class "models.ConViT" for more details.

Source code in mindcv\models\convit.py
353
354
355
356
357
358
359
360
361
362
363
364
365
@register_model
def convit_tiny_plus(pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs) -> ConViT:
    """Get ConViT tiny+ model (4 heads, embed_dim 256).
    Refer to the base class "models.ConViT" for more details.
    """
    model = ConViT(
        in_channels=in_channels,
        num_classes=num_classes,
        num_heads=4,
        embed_dim=256,
        **kwargs,
    )
    if pretrained:
        load_pretrained(model, default_cfgs["convit_tiny_plus"],
                        num_classes=num_classes, in_channels=in_channels)
    return model

convnext

mindcv.models.convnext

MindSpore implementation of ConvNeXt and ConvNeXt V2. Refer to: A ConvNet for the 2020s ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders

mindcv.models.convnext.Block

Bases: Cell

ConvNeXt Block There are two equivalent implementations: (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W) (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back Unlike the official impl, this one allows choice of 1 or 2, 1x1 conv can be faster with appropriate choice of LayerNorm impl, however as model size increases the tradeoffs appear to change and nn.Linear is a better choice. This was observed with PyTorch 1.10 on 3090 GPU, it could change over time & w/ different HW. Args: dim: Number of input channels. drop_path: Stochastic depth rate. Default: 0.0. layer_scale_init_value: Init value for Layer Scale. Default: 1e-6.

Source code in mindcv\models\convnext.py
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
class Block(nn.Cell):
    """ConvNeXt block: depthwise 7x7 conv -> LayerNorm -> pointwise MLP,
    with optional GRN (ConvNeXt V2), optional layer scale, and stochastic
    depth on the residual branch.

    The pointwise convs are implemented as Dense layers applied to the
    channels-last (N, H, W, C) layout; the block transposes in and out of
    that layout around the MLP.

    Args:
        dim: Number of input channels.
        drop_path: Stochastic depth rate. Default: 0.0.
        layer_scale_init_value: Init value for Layer Scale; a value <= 0
            disables layer scale. Default: 1e-6.
        use_grn: If True, apply Global Response Normalization after the
            activation. Default: False.
    """

    def __init__(
        self,
        dim: int,
        drop_path: float = 0.0,
        layer_scale_init_value: float = 1e-6,
        use_grn: bool = False,
    ) -> None:
        super().__init__()
        self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, group=dim, has_bias=True)  # depthwise conv
        self.norm = ConvNextLayerNorm((dim,), epsilon=1e-6)
        self.pwconv1 = nn.Dense(dim, 4 * dim)  # pointwise/1x1 convs, implemented with Dense layers
        self.act = nn.GELU()
        self.use_grn = use_grn
        if use_grn:
            self.grn = GRN(4 * dim)
        self.pwconv2 = nn.Dense(4 * dim, dim)
        if layer_scale_init_value > 0:
            init_value = Tensor(layer_scale_init_value * np.ones(dim), dtype=mstype.float32)
            self.gamma_ = Parameter(init_value, requires_grad=True)
        else:
            self.gamma_ = None
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity()

    def construct(self, x: Tensor) -> Tensor:
        shortcut = x
        out = self.dwconv(x)
        out = ops.transpose(out, (0, 2, 3, 1))  # NCHW -> NHWC for the Dense MLP
        out = self.norm(out)
        out = self.pwconv1(out)
        out = self.act(out)
        if self.use_grn:
            out = self.grn(out)
        out = self.pwconv2(out)
        if self.gamma_ is not None:
            out = self.gamma_ * out
        out = ops.transpose(out, (0, 3, 1, 2))  # NHWC -> NCHW
        return shortcut + self.drop_path(out)

mindcv.models.convnext.ConvNeXt

Bases: Cell

ConvNeXt and ConvNeXt V2 model class, based on "A ConvNet for the 2020s" <https://arxiv.org/abs/2201.03545>_ and "ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders" <https://arxiv.org/abs/2301.00808>_

PARAMETER DESCRIPTION
in_channels

dim of the input channel.

TYPE: int

num_classes

dim of the classes predicted.

TYPE: int

depths

the depths of each layer.

TYPE: List[int]

dims

the middle dim of each layer.

TYPE: List[int]

drop_path_rate

the rate of droppath. Default: 0.0.

TYPE: float DEFAULT: 0.0

layer_scale_init_value

the parameter of init for the classifier. Default: 1e-6.

TYPE: float DEFAULT: 1e-06

head_init_scale

the parameter of init for the head. Default: 1.0.

TYPE: float DEFAULT: 1.0

use_grn

If True, use Global Response Normalization in each block. Default: False.

TYPE: bool DEFAULT: False

Source code in mindcv\models\convnext.py
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
class ConvNeXt(nn.Cell):
    r"""ConvNeXt and ConvNeXt V2 model class, based on
    `"A ConvNet for the 2020s" <https://arxiv.org/abs/2201.03545>`_ and
    `"ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders" <https://arxiv.org/abs/2301.00808>`_

    Args:
        in_channels: dim of the input channel.
        num_classes: dim of the classes predicted.
        depths: the depths of each layer (number of Blocks per stage; 4 stages expected).
        dims: the middle dim of each layer (channel width per stage; 4 entries expected).
        drop_path_rate: the rate of droppath. Default: 0.0.
        layer_scale_init_value: init value for the per-block layer scale; <= 0 disables it. Default: 1e-6.
        head_init_scale: multiplier applied to the classifier weight/bias after init. Default: 1.0.
        use_grn: If True, use Global Response Normalization in each block (ConvNeXt V2). Default: False.
    """

    def __init__(
        self,
        in_channels: int,
        num_classes: int,
        depths: List[int],
        dims: List[int],
        drop_path_rate: float = 0.0,
        layer_scale_init_value: float = 1e-6,
        head_init_scale: float = 1.0,
        use_grn: bool = False,
    ):
        super().__init__()

        downsample_layers = []  # stem and 3 intermediate down_sampling conv layers
        # Stem: 4x4/4 "patchify" conv followed by a channels-first LayerNorm.
        stem = nn.SequentialCell(
            nn.Conv2d(in_channels, dims[0], kernel_size=4, stride=4, has_bias=True),
            ConvNextLayerNorm((dims[0],), epsilon=1e-6, norm_axis=1),
        )
        downsample_layers.append(stem)
        for i in range(3):
            # Between stages: norm then 2x2/2 conv halves resolution and widens channels.
            downsample_layer = nn.SequentialCell(
                ConvNextLayerNorm((dims[i],), epsilon=1e-6, norm_axis=1),
                nn.Conv2d(dims[i], dims[i + 1], kernel_size=2, stride=2, has_bias=True),
            )
            downsample_layers.append(downsample_layer)

        total_reduction = 4  # the stem already downsamples by 4
        self.feature_info = []
        # NOTE(review): presumably a hint for feature-extraction wrappers — confirm against consumer
        self.flatten_sequential = True

        stages = []  # 4 feature resolution stages, each consisting of multiple residual blocks
        # Drop-path rate increases linearly across all blocks of the network.
        dp_rates = list(np.linspace(0, drop_path_rate, sum(depths)))
        cur = 0
        for i in range(4):
            blocks = []
            for j in range(depths[i]):
                blocks.append(Block(dim=dims[i], drop_path=dp_rates[cur + j],
                                    layer_scale_init_value=layer_scale_init_value, use_grn=use_grn))
            stage = nn.SequentialCell(blocks)
            stages.append(stage)
            cur += depths[i]

            if i > 0:
                total_reduction *= 2
            # `name` indexes into self.feature below: stage i sits at position 2*i + 1.
            self.feature_info.append(dict(chs=dims[i], reduction=total_reduction, name=f'feature.{i * 2 + 1}'))

        # Interleave downsample layers and stages: [down0, stage0, down1, stage1, ...].
        self.feature = nn.SequentialCell([
            downsample_layers[0],
            stages[0],
            downsample_layers[1],
            stages[1],
            downsample_layers[2],
            stages[2],
            downsample_layers[3],
            stages[3]
        ])
        self.norm = ConvNextLayerNorm((dims[-1],), epsilon=1e-6)  # final norm layer
        self.classifier = nn.Dense(dims[-1], num_classes)  # classifier
        self.head_init_scale = head_init_scale
        self._initialize_weights()

    def _initialize_weights(self) -> None:
        """Initialize weights for cells."""
        # Truncated-normal init for conv/dense weights, zero init for dense bias.
        for _, cell in self.cells_and_names():
            if isinstance(cell, (nn.Dense, nn.Conv2d)):
                cell.weight.set_data(
                    init.initializer(init.TruncatedNormal(sigma=0.02), cell.weight.shape, cell.weight.dtype)
                )
                if isinstance(cell, nn.Dense) and cell.bias is not None:
                    cell.bias.set_data(init.initializer(init.Zero(), cell.bias.shape, cell.bias.dtype))
        # Rescale the classification head in place (head_init_scale).
        self.classifier.weight.set_data(self.classifier.weight * self.head_init_scale)
        self.classifier.bias.set_data(self.classifier.bias * self.head_init_scale)

    def forward_head(self, x: Tensor) -> Tensor:
        """Apply the classification head to pooled features."""
        x = self.classifier(x)
        return x

    def forward_features(self, x: Tensor) -> Tensor:
        """Run the backbone and return pooled, normalized features."""
        x = self.feature(x)
        return self.norm(x.mean([-2, -1]))  # global average pooling, (N, C, H, W) -> (N, C)

    def construct(self, x: Tensor) -> Tensor:
        x = self.forward_features(x)
        x = self.forward_head(x)
        return x

mindcv.models.convnext.ConvNextLayerNorm

Bases: LayerNorm

LayerNorm for channels_first tensors with 2d spatial dimensions (ie N, C, H, W).

Source code in mindcv\models\convnext.py
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
class ConvNextLayerNorm(nn.LayerNorm):
    """LayerNorm that also supports channels_first (N, C, H, W) tensors.

    With ``norm_axis=-1`` it behaves like a plain LayerNorm over the last
    axis; with ``norm_axis=1`` the input is transposed to channels-last,
    normalized, and transposed back.
    """

    def __init__(
        self,
        normalized_shape: Tuple[int],
        epsilon: float,
        norm_axis: int = -1,
    ) -> None:
        super().__init__(normalized_shape=normalized_shape, epsilon=epsilon)
        assert norm_axis in (-1, 1), "ConvNextLayerNorm's norm_axis must be 1 or -1."
        self.norm_axis = norm_axis

    def construct(self, input_x: Tensor) -> Tensor:
        if self.norm_axis != -1:
            # channels_first: move C to the last axis, normalize, move it back
            nhwc = ops.transpose(input_x, (0, 2, 3, 1))
            y, _, _ = self.layer_norm(nhwc, self.gamma, self.beta)
            return ops.transpose(y, (0, 3, 1, 2))
        y, _, _ = self.layer_norm(input_x, self.gamma, self.beta)
        return y

mindcv.models.convnext.GRN

Bases: Cell

GRN (Global Response Normalization) layer

Source code in mindcv\models\convnext.py
65
66
67
68
69
70
71
72
73
74
75
76
77
class GRN(nn.Cell):
    """GRN (Global Response Normalization) layer.

    Computes an L2 response over the spatial axes (axes 1 and 2, i.e. a
    channels-last layout), normalizes it across channels, and applies a
    learnable affine re-weighting on top of a residual connection.
    """

    def __init__(self, dim: int):
        super().__init__()
        param_shape = [1, 1, 1, dim]
        self.gamma = Parameter(Tensor(np.zeros(param_shape), mstype.float32))
        self.beta = Parameter(Tensor(np.zeros(param_shape), mstype.float32))
        # L2 norm over spatial dims, keeping dims for broadcasting.
        self.norm = ops.LpNorm(axis=[1, 2], p=2, keep_dims=True)

    def construct(self, x: Tensor) -> Tensor:
        gx = self.norm(x)  # per-channel global response
        nx = gx / (ops.mean(gx, axis=-1, keep_dims=True) + 1e-6)  # normalize across channels
        return self.gamma * (x * nx) + self.beta + x

mindcv.models.convnext.convnext_base(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ConvNeXt base model. Refer to the base class 'models.ConvNeXt' for more details.

Source code in mindcv\models\convnext.py
287
288
289
290
291
292
293
294
295
296
@register_model
def convnext_base(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> ConvNeXt:
    """Build the ConvNeXt base model.
    Refer to the base class 'models.ConvNeXt' for more details.
    """
    model_args = dict(
        in_channels=in_channels,
        num_classes=num_classes,
        depths=[3, 3, 27, 3],
        dims=[128, 256, 512, 1024],
        **kwargs,
    )
    return _create_convnext(pretrained, default_cfg=default_cfgs["convnext_base"], **model_args)

mindcv.models.convnext.convnext_large(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ConvNeXt large model. Refer to the base class 'models.ConvNeXt' for more details.

Source code in mindcv\models\convnext.py
299
300
301
302
303
304
305
306
307
308
@register_model
def convnext_large(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> ConvNeXt:
    """Build the ConvNeXt large model.
    Refer to the base class 'models.ConvNeXt' for more details.
    """
    model_args = dict(
        in_channels=in_channels,
        num_classes=num_classes,
        depths=[3, 3, 27, 3],
        dims=[192, 384, 768, 1536],
        **kwargs,
    )
    return _create_convnext(pretrained, default_cfg=default_cfgs["convnext_large"], **model_args)

mindcv.models.convnext.convnext_small(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ConvNeXt small model. Refer to the base class 'models.ConvNeXt' for more details.

Source code in mindcv\models\convnext.py
275
276
277
278
279
280
281
282
283
284
@register_model
def convnext_small(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> ConvNeXt:
    """Build the ConvNeXt small model.
    Refer to the base class 'models.ConvNeXt' for more details.
    """
    model_args = dict(
        in_channels=in_channels,
        num_classes=num_classes,
        depths=[3, 3, 27, 3],
        dims=[96, 192, 384, 768],
        **kwargs,
    )
    return _create_convnext(pretrained, default_cfg=default_cfgs["convnext_small"], **model_args)

mindcv.models.convnext.convnext_tiny(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ConvNeXt tiny model. Refer to the base class 'models.ConvNeXt' for more details.

Source code in mindcv\models\convnext.py
263
264
265
266
267
268
269
270
271
272
@register_model
def convnext_tiny(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> ConvNeXt:
    """Build the ConvNeXt tiny model.
    Refer to the base class 'models.ConvNeXt' for more details.
    """
    model_args = dict(
        in_channels=in_channels,
        num_classes=num_classes,
        depths=[3, 3, 9, 3],
        dims=[96, 192, 384, 768],
        **kwargs,
    )
    return _create_convnext(pretrained, default_cfg=default_cfgs["convnext_tiny"], **model_args)

mindcv.models.convnext.convnext_xlarge(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ConvNeXt xlarge model. Refer to the base class 'models.ConvNeXt' for more details.

Source code in mindcv\models\convnext.py
311
312
313
314
315
316
317
318
319
320
@register_model
def convnext_xlarge(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> ConvNeXt:
    """Build the ConvNeXt xlarge model.
    Refer to the base class 'models.ConvNeXt' for more details.
    """
    model_args = dict(
        in_channels=in_channels,
        num_classes=num_classes,
        depths=[3, 3, 27, 3],
        dims=[256, 512, 1024, 2048],
        **kwargs,
    )
    return _create_convnext(pretrained, default_cfg=default_cfgs["convnext_xlarge"], **model_args)

mindcv.models.convnext.convnextv2_atto(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ConvNeXt_v2 atto model. Refer to the base class 'models.ConvNeXt' for more details.

Source code in mindcv\models\convnext.py
323
324
325
326
327
328
329
330
331
@register_model
def convnextv2_atto(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> ConvNeXt:
    """Build the ConvNeXt V2 atto model.
    Refer to the base class 'models.ConvNeXt' for more details.
    """
    model_args = dict(
        in_channels=in_channels,
        num_classes=num_classes,
        depths=[2, 2, 6, 2],
        dims=[40, 80, 160, 320],
        use_grn=True,  # V2: GRN replaces layer scale
        layer_scale_init_value=0.0,
        **kwargs,
    )
    return _create_convnext(pretrained, default_cfg=default_cfgs["convnextv2_atto"], **model_args)

mindcv.models.convnext.convnextv2_base(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ConvNeXt_v2 base model. Refer to the base class 'models.ConvNeXt' for more details.

Source code in mindcv\models\convnext.py
378
379
380
381
382
383
384
385
386
@register_model
def convnextv2_base(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> ConvNeXt:
    """Build the ConvNeXt V2 base model.
    Refer to the base class 'models.ConvNeXt' for more details.
    """
    model_args = dict(
        in_channels=in_channels,
        num_classes=num_classes,
        depths=[3, 3, 27, 3],
        dims=[128, 256, 512, 1024],
        use_grn=True,  # V2: GRN replaces layer scale
        layer_scale_init_value=0.0,
        **kwargs,
    )
    return _create_convnext(pretrained, default_cfg=default_cfgs["convnextv2_base"], **model_args)

mindcv.models.convnext.convnextv2_femto(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ConvNeXt_v2 femto model. Refer to the base class 'models.ConvNeXt' for more details.

Source code in mindcv\models\convnext.py
334
335
336
337
338
339
340
341
342
@register_model
def convnextv2_femto(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> ConvNeXt:
    """Build the ConvNeXt V2 femto model.
    Refer to the base class 'models.ConvNeXt' for more details.
    """
    model_args = dict(
        in_channels=in_channels,
        num_classes=num_classes,
        depths=[2, 2, 6, 2],
        dims=[48, 96, 192, 384],
        use_grn=True,  # V2: GRN replaces layer scale
        layer_scale_init_value=0.0,
        **kwargs,
    )
    return _create_convnext(pretrained, default_cfg=default_cfgs["convnextv2_femto"], **model_args)

mindcv.models.convnext.convnextv2_huge(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ConvNeXt_v2 huge model. Refer to the base class 'models.ConvNeXt' for more details.

Source code in mindcv\models\convnext.py
400
401
402
403
404
405
406
407
408
@register_model
def convnextv2_huge(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> ConvNeXt:
    """Build the ConvNeXt V2 huge model.
    Refer to the base class 'models.ConvNeXt' for more details.
    """
    model_args = dict(
        in_channels=in_channels,
        num_classes=num_classes,
        depths=[3, 3, 27, 3],
        dims=[352, 704, 1408, 2816],
        use_grn=True,  # V2: GRN replaces layer scale
        layer_scale_init_value=0.0,
        **kwargs,
    )
    return _create_convnext(pretrained, default_cfg=default_cfgs["convnextv2_huge"], **model_args)

mindcv.models.convnext.convnextv2_large(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ConvNeXt_v2 large model. Refer to the base class 'models.ConvNeXt' for more details.

Source code in mindcv\models\convnext.py
389
390
391
392
393
394
395
396
397
@register_model
def convnextv2_large(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> ConvNeXt:
    """Build the ConvNeXt V2 large model.
    Refer to the base class 'models.ConvNeXt' for more details.
    """
    model_args = dict(
        in_channels=in_channels,
        num_classes=num_classes,
        depths=[3, 3, 27, 3],
        dims=[192, 384, 768, 1536],
        use_grn=True,  # V2: GRN replaces layer scale
        layer_scale_init_value=0.0,
        **kwargs,
    )
    return _create_convnext(pretrained, default_cfg=default_cfgs["convnextv2_large"], **model_args)

mindcv.models.convnext.convnextv2_nano(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ConvNeXt_v2 nano model. Refer to the base class 'models.ConvNeXt' for more details.

Source code in mindcv\models\convnext.py
356
357
358
359
360
361
362
363
364
@register_model
def convnextv2_nano(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> ConvNeXt:
    """Build the ConvNeXt V2 nano model.
    Refer to the base class 'models.ConvNeXt' for more details.
    """
    model_args = dict(
        in_channels=in_channels,
        num_classes=num_classes,
        depths=[2, 2, 8, 2],
        dims=[80, 160, 320, 640],
        use_grn=True,  # V2: GRN replaces layer scale
        layer_scale_init_value=0.0,
        **kwargs,
    )
    return _create_convnext(pretrained, default_cfg=default_cfgs["convnextv2_nano"], **model_args)

mindcv.models.convnext.convnextv2_pico(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ConvNeXt_v2 pico model. Refer to the base class 'models.ConvNeXt' for more details.

Source code in mindcv\models\convnext.py
345
346
347
348
349
350
351
352
353
@register_model
def convnextv2_pico(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> ConvNeXt:
    """Build the ConvNeXt V2 pico model.
    Refer to the base class 'models.ConvNeXt' for more details.
    """
    model_args = dict(
        in_channels=in_channels,
        num_classes=num_classes,
        depths=[2, 2, 6, 2],
        dims=[64, 128, 256, 512],
        use_grn=True,  # V2: GRN replaces layer scale
        layer_scale_init_value=0.0,
        **kwargs,
    )
    return _create_convnext(pretrained, default_cfg=default_cfgs["convnextv2_pico"], **model_args)

mindcv.models.convnext.convnextv2_tiny(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ConvNeXt_v2 tiny model. Refer to the base class 'models.ConvNeXt' for more details.

Source code in mindcv\models\convnext.py
367
368
369
370
371
372
373
374
375
@register_model
def convnextv2_tiny(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> ConvNeXt:
    """Build the ConvNeXt V2 tiny model.
    Refer to the base class 'models.ConvNeXt' for more details.
    """
    model_args = dict(
        in_channels=in_channels,
        num_classes=num_classes,
        depths=[3, 3, 9, 3],
        dims=[96, 192, 384, 768],
        use_grn=True,  # V2: GRN replaces layer scale
        layer_scale_init_value=0.0,
        **kwargs,
    )
    return _create_convnext(pretrained, default_cfg=default_cfgs["convnextv2_tiny"], **model_args)

crossvit

mindcv.models.crossvit

MindSpore implementation of crossvit. Refer to crossvit: Cross-Attention Multi-Scale Vision Transformer for Image Classification

mindcv.models.crossvit.PatchEmbed

Bases: Cell

Image to Patch Embedding

Source code in mindcv\models\crossvit.py
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
class PatchEmbed(nn.Cell):
    """Image to Patch Embedding.

    Projects an image to a sequence of patch tokens, either with one strided
    convolution (``multi_conv=False``) or with a 3-conv stem
    (``multi_conv=True``; only patch sizes 12 and 16 have a defined stem).

    Args:
        img_size: input image size (int or 2-tuple). Default: 224.
        patch_size: patch size (int or 2-tuple). Default: 16.
        in_chans: number of input channels. Default: 3.
        embed_dim: embedding dim of each patch token. Default: 768.
        multi_conv: if True, use the multi-convolution stem. Default: True.

    Raises:
        ValueError: if ``multi_conv`` is True and ``patch_size[0]`` is neither 12 nor 16.
    """

    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, multi_conv=True):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])
        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = num_patches
        if multi_conv:
            if patch_size[0] == 12:
                # 4 * 3 * 1 = 12x overall stride
                self.proj = nn.SequentialCell(
                    nn.Conv2d(in_chans, embed_dim // 4, pad_mode='pad', kernel_size=7, stride=4, padding=3),
                    nn.ReLU(),
                    nn.Conv2d(embed_dim // 4, embed_dim // 2, pad_mode='pad', kernel_size=3, stride=3, padding=0),
                    nn.ReLU(),
                    nn.Conv2d(embed_dim // 2, embed_dim, pad_mode='pad', kernel_size=3, stride=1, padding=1),
                )
            elif patch_size[0] == 16:
                # 4 * 2 * 2 = 16x overall stride
                self.proj = nn.SequentialCell(
                    nn.Conv2d(in_chans, embed_dim // 4, pad_mode='pad', kernel_size=7, stride=4, padding=3),
                    nn.ReLU(),
                    nn.Conv2d(embed_dim // 4, embed_dim // 2, pad_mode='pad', kernel_size=3, stride=2, padding=1),
                    nn.ReLU(),
                    nn.Conv2d(embed_dim // 2, embed_dim, pad_mode='pad', kernel_size=3, stride=2, padding=1),
                )
            else:
                # Fix: previously an unsupported patch size silently left `self.proj`
                # undefined, deferring failure to an AttributeError at first forward.
                raise ValueError(
                    f"multi_conv PatchEmbed only supports patch sizes 12 and 16, got {patch_size[0]}"
                )
        else:
            self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, pad_mode='valid',
                                  has_bias=True)

    def construct(self, x: Tensor) -> Tensor:
        B, C, H, W = x.shape
        # FIXME look at relaxing size constraints

        # assert H == self.img_size[0] and W == self.img_size[1], \
        # f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
        x = self.proj(x)
        B, C, H, W = x.shape
        # Flatten spatial dims to a token sequence: (B, C, H, W) -> (B, H*W, C).
        x = x.reshape(B, C, H * W)
        x = ops.transpose(x, (0, 2, 1))
        return x

mindcv.models.crossvit.VisionTransformer

Bases: Cell

Vision Transformer with support for patch or hybrid CNN input stage

Source code in mindcv\models\crossvit.py
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
class VisionTransformer(nn.Cell):
    """ Vision Transformer with support for patch or hybrid CNN input stage

    Multi-branch (CrossViT-style) transformer: one patch embedding, positional
    embedding, cls token, final norm and classification head per branch
    (branch count = ``len(patch_size)``); the per-branch head logits are
    averaged in ``forward_head``.
    """

    def __init__(self, img_size=(224, 224), patch_size=(8, 16), in_channels=3, num_classes=1000, embed_dim=(192, 384),
                 depth=([1, 3, 1], [1, 3, 1], [1, 3, 1]),
                 num_heads=(6, 12), mlp_ratio=(2., 2., 4.), qkv_bias=False, qk_scale=None, drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0., hybrid_backbone=None, norm_layer=nn.LayerNorm, multi_conv=False):
        super().__init__()

        self.num_classes = num_classes
        if not isinstance(img_size, list):
            img_size = to_2tuple(img_size)
        self.img_size = img_size

        num_patches = _compute_num_patches(img_size, patch_size)  # tokens per branch
        self.num_branches = len(patch_size)
        # Used in forward_features to resize the input per branch when needed.
        self.interpolate = Interpolate(mode="bilinear", align_corners=True)

        patch_embed = []
        if hybrid_backbone is None:
            # One learnable positional embedding per branch: (1, 1 + num_patches, embed_dim).
            b = []
            for i in range(self.num_branches):
                c = ms.Parameter(Tensor(np.zeros([1, 1 + num_patches[i], embed_dim[i]], np.float32)),
                                 name='pos_embed.' + str(i))
                b.append(c)
            b = tuple(b)
            self.pos_embed = ms.ParameterTuple(b)
            for im_s, p, d in zip(img_size, patch_size, embed_dim):
                patch_embed.append(
                    PatchEmbed(img_size=im_s, patch_size=p, in_chans=in_channels, embed_dim=d, multi_conv=multi_conv))
            self.patch_embed = nn.CellList(patch_embed)
        # NOTE(review): with a non-None hybrid_backbone neither pos_embed nor
        # patch_embed is created, so construct would fail — confirm intended usage.

        # One learnable cls token per branch: (1, 1, embed_dim).
        d = []
        for i in range(self.num_branches):
            c = ms.Parameter(Tensor(np.zeros([1, 1, embed_dim[i]], np.float32)), name='cls_token.' + str(i))
            d.append(c)
        d = tuple(d)
        self.cls_token = ms.ParameterTuple(d)
        self.pos_drop = Dropout(p=drop_rate)

        total_depth = sum([sum(x[-2:]) for x in depth])
        dpr = np.linspace(0, drop_path_rate, total_depth)  # stochastic depth decay rule
        dpr_ptr = 0
        self.blocks = nn.CellList()
        for idx, block_cfg in enumerate(depth):
            # Number of drop-path rates consumed by this multi-scale block.
            curr_depth = max(block_cfg[:-1]) + block_cfg[-1]
            dpr_ = dpr[dpr_ptr:dpr_ptr + curr_depth]
            blk = MultiScaleBlock(embed_dim, num_patches, block_cfg, num_heads=num_heads, mlp_ratio=mlp_ratio,
                                  qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate,
                                  drop_path=dpr_,
                                  norm_layer=norm_layer)
            dpr_ptr += curr_depth
            self.blocks.append(blk)

        # Per-branch final norm and classification head.
        self.norm = nn.CellList([norm_layer((embed_dim[i],), epsilon=1e-6) for i in range(self.num_branches)])
        self.head = nn.CellList([nn.Dense(embed_dim[i], num_classes) if num_classes > 0 else Identity() for i in
                                 range(self.num_branches)])

        # Truncated-normal init for positional embeddings and cls tokens.
        for i in range(self.num_branches):
            if self.pos_embed[i].requires_grad:
                tensor1 = init.initializer(TruncatedNormal(sigma=.02), self.pos_embed[i].data.shape, ms.float32)
                self.pos_embed[i].set_data(tensor1)
            tensor2 = init.initializer(TruncatedNormal(sigma=.02), self.cls_token[i].data.shape, ms.float32)
            self.cls_token[i].set_data(tensor2)

        self._initialize_weights()

    def _initialize_weights(self) -> None:
        """Truncated-normal init for Dense layers; constant init for LayerNorm."""
        for _, cell in self.cells_and_names():
            if isinstance(cell, nn.Dense):
                cell.weight.set_data(init.initializer(init.TruncatedNormal(sigma=.02), cell.weight.data.shape))
                if cell.bias is not None:
                    cell.bias.set_data(init.initializer(init.Constant(0), cell.bias.shape))
            elif isinstance(cell, nn.LayerNorm):
                cell.gamma.set_data(init.initializer(init.Constant(1), cell.gamma.shape))
                cell.beta.set_data(init.initializer(init.Constant(0), cell.beta.shape))

    def no_weight_decay(self):
        """Parameter name prefixes to exclude from weight decay."""
        out = {'cls_token'}
        if self.pos_embed[0].requires_grad:
            out.add('pos_embed')
        return out

    def get_classifier(self):
        """Return the classification head(s)."""
        return self.head

    def reset_classifier(self, num_classes, global_pool=''):
        """Replace the classification head for a new number of classes."""
        self.num_classes = num_classes
        # NOTE(review): self.embed_dim is never assigned in __init__, so this
        # raises AttributeError if called — verify before use.
        self.head = nn.Dense(self.embed_dim, num_classes) if num_classes > 0 else Identity()

    def forward_features(self, x: Tensor) -> Tensor:
        """Return the list of per-branch cls-token features."""
        B, C, H, W = x.shape
        xs = []
        for i in range(self.num_branches):
            # Resize the input to this branch's expected resolution if needed.
            x_ = self.interpolate(x, size=(self.img_size[i], self.img_size[i])) if H != self.img_size[i] else x
            tmp = self.patch_embed[i](x_)
            z = self.cls_token[i].shape
            y = Tensor(np.ones((B, z[1], z[2])), dtype=mstype.float32)
            cls_tokens = self.cls_token[i]
            cls_tokens = cls_tokens.expand_as(y)  # stole cls_tokens impl from Phil Wang, thanks
            con = ops.Concat(1)
            cls_tokens = cls_tokens.astype("float32")
            tmp = tmp.astype("float32")
            tmp = con((cls_tokens, tmp))  # prepend cls token to the patch sequence
            tmp = tmp + self.pos_embed[i]
            tmp = self.pos_drop(tmp)
            xs.append(tmp)

        for blk in self.blocks:
            xs = blk(xs)

        # NOTE: was before branch token section, move to here to assure all branch token are before layer norm
        k = 0
        xs2 = []
        for x in xs:
            xs2.append(self.norm[k](x))
            k = k + 1
        xs = xs2
        out = []
        for x in xs:
            out.append(x[:, 0])  # keep only the cls token of each branch
        return out

    def forward_head(self, x: Tensor) -> Tensor:
        """Average the per-branch head logits into a single prediction."""
        ce_logits = []
        zz = 0
        for c in x:
            ce_logits.append(self.head[zz](c))
            zz = zz + 1
        # NOTE(review): stacking only indices 0 and 1 hard-codes exactly two branches — confirm.
        z = ops.stack([ce_logits[0], ce_logits[1]])
        op = ops.ReduceMean(keep_dims=False)
        ce_logits = op(z, 0)
        return ce_logits

    def construct(self, x: Tensor) -> Tensor:
        x = self.forward_features(x)
        x = self.forward_head(x)
        return x

densenet

mindcv.models.densenet

MindSpore implementation of DenseNet. Refer to: Densely Connected Convolutional Networks

mindcv.models.densenet.DenseNet

Bases: Cell

Densenet-BC model class, based on "Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>_

PARAMETER DESCRIPTION
growth_rate

how many filters to add each layer (k in paper). Default: 32.

TYPE: int DEFAULT: 32

block_config

how many layers in each pooling block. Default: (6, 12, 24, 16).

TYPE: Tuple[int, int, int, int] DEFAULT: (6, 12, 24, 16)

num_init_features

number of filters in the first Conv2d. Default: 64.

TYPE: int DEFAULT: 64

bn_size

multiplicative factor for number of bottleneck layers (i.e. bn_size * k features in the bottleneck layer). Default: 4.

TYPE: int DEFAULT: 4

drop_rate

dropout rate after each dense layer. Default: 0.

TYPE: float DEFAULT: 0.0

in_channels

number of input channels. Default: 3.

TYPE: int DEFAULT: 3

num_classes

number of classification classes. Default: 1000.

TYPE: int DEFAULT: 1000

Source code in mindcv\models\densenet.py
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
class DenseNet(nn.Cell):
    r"""Densenet-BC model class, based on
    `"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_

    Args:
        growth_rate: number of filters each dense layer adds (`k` in the paper). Default: 32.
        block_config: number of layers in each dense block. Default: (6, 12, 24, 16).
        num_init_features: number of filters of the stem Conv2d. Default: 64.
        bn_size: multiplicative factor for the bottleneck width
            (i.e. bn_size * k features in the bottleneck layer). Default: 4.
        drop_rate: dropout rate applied after each dense layer. Default: 0.
        in_channels: number of input channels. Default: 3.
        num_classes: number of classification classes. Default: 1000.
    """

    def __init__(
        self,
        growth_rate: int = 32,
        block_config: Tuple[int, int, int, int] = (6, 12, 24, 16),
        num_init_features: int = 64,
        bn_size: int = 4,
        drop_rate: float = 0.0,
        in_channels: int = 3,
        num_classes: int = 1000,
    ) -> None:
        super().__init__()
        feature_layers = OrderedDict()

        # stem: 7x7 strided conv, BN, ReLU, then an explicitly padded 3x3 max-pool
        channels = num_init_features
        feature_layers["conv0"] = nn.Conv2d(in_channels, channels, kernel_size=7, stride=2, pad_mode="pad", padding=3)
        feature_layers["norm0"] = nn.BatchNorm2d(channels)
        feature_layers["relu0"] = nn.ReLU()
        feature_layers["pool0"] = nn.SequentialCell([
            nn.Pad(paddings=((0, 0), (0, 0), (1, 1), (1, 1)), mode="CONSTANT"),
            nn.MaxPool2d(kernel_size=3, stride=2),
        ])

        # dense blocks; every block except the last is followed by a channel-halving transition
        last_idx = len(block_config) - 1
        for idx, depth in enumerate(block_config):
            feature_layers[f"denseblock{idx + 1}"] = _DenseBlock(
                num_layers=depth,
                num_input_features=channels,
                bn_size=bn_size,
                growth_rate=growth_rate,
                drop_rate=drop_rate,
            )
            channels += depth * growth_rate
            if idx != last_idx:
                feature_layers[f"transition{idx + 1}"] = _Transition(channels, channels // 2)
                channels = channels // 2

        # final BN + ReLU
        feature_layers["norm5"] = nn.BatchNorm2d(channels)
        feature_layers["relu5"] = nn.ReLU()

        self.num_features = channels
        self.features = nn.SequentialCell(feature_layers)
        self.pool = GlobalAvgPooling()
        self.classifier = nn.Dense(self.num_features, num_classes)
        self._initialize_weights()

    def _initialize_weights(self) -> None:
        """Initialize weights for cells: He-normal convs, ones/zeros BN, He-uniform dense."""
        for _, cell in self.cells_and_names():
            if isinstance(cell, nn.Conv2d):
                conv_init = init.HeNormal(math.sqrt(5), mode="fan_out", nonlinearity="relu")
                cell.weight.set_data(init.initializer(conv_init, cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    bias_init = init.HeUniform(math.sqrt(5), mode="fan_in", nonlinearity="leaky_relu")
                    cell.bias.set_data(init.initializer(bias_init, cell.bias.shape, cell.bias.dtype))
            elif isinstance(cell, nn.BatchNorm2d):
                cell.gamma.set_data(init.initializer("ones", cell.gamma.shape, cell.gamma.dtype))
                cell.beta.set_data(init.initializer("zeros", cell.beta.shape, cell.beta.dtype))
            elif isinstance(cell, nn.Dense):
                dense_init = init.HeUniform(math.sqrt(5), mode="fan_in", nonlinearity="leaky_relu")
                cell.weight.set_data(init.initializer(dense_init, cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(init.initializer("zeros", cell.bias.shape, cell.bias.dtype))

    def forward_features(self, x: Tensor) -> Tensor:
        """Run the convolutional feature extractor."""
        return self.features(x)

    def forward_head(self, x: Tensor) -> Tensor:
        """Global-average-pool the features and classify."""
        pooled = self.pool(x)
        return self.classifier(pooled)

    def construct(self, x: Tensor) -> Tensor:
        return self.forward_head(self.forward_features(x))

mindcv.models.densenet.densenet121(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get 121 layers DenseNet model. Refer to the base class models.DenseNet for more details.

Source code in mindcv\models\densenet.py
225
226
227
228
229
230
231
232
233
234
235
236
@register_model
def densenet121(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> DenseNet:
    """Get 121 layers DenseNet model.
     Refer to the base class `models.DenseNet` for more details."""
    model = DenseNet(
        growth_rate=32,
        block_config=(6, 12, 24, 16),
        num_init_features=64,
        in_channels=in_channels,
        num_classes=num_classes,
        **kwargs,
    )
    if pretrained:
        load_pretrained(model, default_cfgs["densenet121"], num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.densenet.densenet161(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get 161 layers DenseNet model. Refer to the base class models.DenseNet for more details.

Source code in mindcv\models\densenet.py
239
240
241
242
243
244
245
246
247
248
249
250
@register_model
def densenet161(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> DenseNet:
    """Get 161 layers DenseNet model.
     Refer to the base class `models.DenseNet` for more details."""
    model = DenseNet(
        growth_rate=48,
        block_config=(6, 12, 36, 24),
        num_init_features=96,
        in_channels=in_channels,
        num_classes=num_classes,
        **kwargs,
    )
    if pretrained:
        load_pretrained(model, default_cfgs["densenet161"], num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.densenet.densenet169(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get 169 layers DenseNet model. Refer to the base class models.DenseNet for more details.

Source code in mindcv\models\densenet.py
253
254
255
256
257
258
259
260
261
262
263
264
@register_model
def densenet169(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> DenseNet:
    """Get 169 layers DenseNet model.
     Refer to the base class `models.DenseNet` for more details."""
    model = DenseNet(
        growth_rate=32,
        block_config=(6, 12, 32, 32),
        num_init_features=64,
        in_channels=in_channels,
        num_classes=num_classes,
        **kwargs,
    )
    if pretrained:
        load_pretrained(model, default_cfgs["densenet169"], num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.densenet.densenet201(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get 201 layers DenseNet model. Refer to the base class models.DenseNet for more details.

Source code in mindcv\models\densenet.py
267
268
269
270
271
272
273
274
275
276
277
278
@register_model
def densenet201(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> DenseNet:
    """Get 201 layers DenseNet model.
     Refer to the base class `models.DenseNet` for more details."""
    model = DenseNet(
        growth_rate=32,
        block_config=(6, 12, 48, 32),
        num_init_features=64,
        in_channels=in_channels,
        num_classes=num_classes,
        **kwargs,
    )
    if pretrained:
        load_pretrained(model, default_cfgs["densenet201"], num_classes=num_classes, in_channels=in_channels)
    return model

dpn

mindcv.models.dpn

MindSpore implementation of DPN. Refer to: Dual Path Networks

mindcv.models.dpn.BottleBlock

Bases: Cell

A block for the Dual Path Architecture

Source code in mindcv\models\dpn.py
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
class BottleBlock(nn.Cell):
    """Bottleneck block of the Dual Path Architecture: three pre-activation
    (BN-ReLU-Conv) stages, with the last 1x1 conv split into a residual branch
    (conv3_r) and a dense branch (conv3_d)."""

    def __init__(
        self,
        in_channel: int,
        num_1x1_a: int,
        num_3x3_b: int,
        num_1x1_c: int,
        inc: int,
        g: int,
        key_stride: int,
    ):
        super().__init__()
        # pre-activation style: BN + ReLU precede every conv
        self.bn1 = nn.BatchNorm2d(in_channel, eps=1e-3, momentum=0.9)
        self.conv1 = nn.Conv2d(in_channel, num_1x1_a, 1, stride=1)
        self.bn2 = nn.BatchNorm2d(num_1x1_a, eps=1e-3, momentum=0.9)
        # grouped 3x3 conv carries the stage's stride
        self.conv2 = nn.Conv2d(num_1x1_a, num_3x3_b, 3, key_stride, pad_mode="pad", padding=1, group=g)
        self.bn3 = nn.BatchNorm2d(num_3x3_b, eps=1e-3, momentum=0.9)
        self.conv3_r = nn.Conv2d(num_3x3_b, num_1x1_c, 1, stride=1)
        self.conv3_d = nn.Conv2d(num_3x3_b, inc, 1, stride=1)
        self.relu = nn.ReLU()

    def construct(self, x: Tensor):
        out = self.conv1(self.relu(self.bn1(x)))
        out = self.conv2(self.relu(self.bn2(out)))
        out = self.relu(self.bn3(out))
        # two outputs: residual-path and dense-path projections
        return (self.conv3_r(out), self.conv3_d(out))

mindcv.models.dpn.DPN

Bases: Cell

DPN model class, based on "Dual Path Networks" <https://arxiv.org/pdf/1707.01629.pdf>_

PARAMETER DESCRIPTION
num_init_channel

int type, the output channel of first blocks. Default: 64.

TYPE: int DEFAULT: 64

k_r

int type, the first channel of each stage. Default: 96.

TYPE: int DEFAULT: 96

g

int type, number of groups in the conv2d. Default: 32.

TYPE: int DEFAULT: 32

k_sec

the number of blocks in each of the four stages. Default: (3, 4, 20, 3).

TYPE: Tuple[int] DEFAULT: (3, 4, 20, 3)

inc_sec

the first output channel in each stage. Default: (16, 32, 24, 128).

TYPE: Tuple[int] DEFAULT: (16, 32, 24, 128)

in_channels

int type, number of input channels. Default: 3.

TYPE: int DEFAULT: 3

num_classes

int type, number of classification classes. Default: 1000.

TYPE: int DEFAULT: 1000

Source code in mindcv\models\dpn.py
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
class DPN(nn.Cell):
    r"""DPN model class, based on
    `"Dual Path Networks" <https://arxiv.org/pdf/1707.01629.pdf>`_

    Args:
        num_init_channel: int type, the output channel of the first conv block. Default: 64.
        k_r: int type, reference width used to derive each stage's bottleneck width. Default: 96.
        g: int type, number of groups in the grouped 3x3 conv2d. Default: 32.
        k_sec: Tuple[int], number of blocks in each of the four stages. Default: (3, 4, 20, 3).
        inc_sec: Tuple[int], dense-path channel increment of each stage. Default: (16, 32, 24, 128).
        in_channels: int type, number of input channels. Default: 3.
        num_classes: int type, number of classification classes. Default: 1000.
    """

    def __init__(
        self,
        num_init_channel: int = 64,
        k_r: int = 96,
        g: int = 32,
        k_sec: Tuple[int, int, int, int] = (3, 4, 20, 3),
        inc_sec: Tuple[int, int, int, int] = (16, 32, 24, 128),
        in_channels: int = 3,
        num_classes: int = 1000,
    ):
        super().__init__()
        blocks = OrderedDict()

        # conv1: stem (7x7 conv, BN, ReLU, max-pool)
        blocks["conv1"] = nn.SequentialCell(OrderedDict([
            ("conv", nn.Conv2d(in_channels, num_init_channel, kernel_size=7, stride=2, pad_mode="pad", padding=3)),
            ("norm", nn.BatchNorm2d(num_init_channel, eps=1e-3, momentum=0.9)),
            ("relu", nn.ReLU()),
            ("maxpool", nn.MaxPool2d(kernel_size=3, stride=2, pad_mode="same")),
        ]))

        # conv2..conv5: the four dual-path stages were four copy-pasted sections that
        # differed only in the bottleneck width `bw` (256/512/1024/2048) and the first
        # block's type; build them in one loop, keeping the exact same layer key names.
        in_channel = num_init_channel
        for stage, (num_blocks, inc) in enumerate(zip(k_sec, inc_sec), start=2):
            bw = 64 * (2 ** stage)  # 256, 512, 1024, 2048 for stages 2..5
            r = int((k_r * bw) / 256)
            if stage == 2:
                # the stem output is a plain tensor, so the very first block must not
                # concat its input ("proj" type, cat_input=False, stride 1)
                blocks["conv2_1"] = DualPathBlock(in_channel, r, r, bw, inc, g, "proj", False)
            else:
                # later stages downsample on entry
                blocks[f"conv{stage}_1"] = DualPathBlock(in_channel, r, r, bw, inc, g, "down")
            in_channel = bw + 3 * inc
            for i in range(2, num_blocks + 1):
                blocks[f"conv{stage}_{i}"] = DualPathBlock(in_channel, r, r, bw, inc, g, "normal")
                in_channel += inc

        self.features = nn.SequentialCell(blocks)
        # final BN + ReLU applied after concatenating the two paths
        self.conv5_x = nn.SequentialCell(OrderedDict([
            ("norm", nn.BatchNorm2d(in_channel, eps=1e-3, momentum=0.9)),
            ("relu", nn.ReLU()),
        ]))
        self.avgpool = GlobalAvgPooling()
        self.classifier = nn.Dense(in_channel, num_classes)
        self._initialize_weights()

    def _initialize_weights(self) -> None:
        """Initialize weights for cells: He-normal convs, ones/zeros BN, He-uniform dense."""
        for _, cell in self.cells_and_names():
            if isinstance(cell, nn.Conv2d):
                cell.weight.set_data(
                    init.initializer(init.HeNormal(math.sqrt(5), mode="fan_out", nonlinearity="relu"),
                                     cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(
                        init.initializer(init.HeUniform(math.sqrt(5), mode="fan_in", nonlinearity="leaky_relu"),
                                         cell.bias.shape, cell.bias.dtype))
            elif isinstance(cell, nn.BatchNorm2d):
                cell.gamma.set_data(init.initializer("ones", cell.gamma.shape, cell.gamma.dtype))
                cell.beta.set_data(init.initializer("zeros", cell.beta.shape, cell.beta.dtype))
            elif isinstance(cell, nn.Dense):
                cell.weight.set_data(
                    init.initializer(init.HeUniform(math.sqrt(5), mode="fan_in", nonlinearity="leaky_relu"),
                                     cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(init.initializer("zeros", cell.bias.shape, cell.bias.dtype))

    def forward_feature(self, x: Tensor) -> Tensor:
        """Run the stages, then concatenate the (residual, dense) path pair and normalize."""
        x = self.features(x)
        x = ops.concat(x, axis=1)
        x = self.conv5_x(x)
        return x

    def forward_head(self, x: Tensor) -> Tensor:
        """Global-average-pool the features and classify."""
        x = self.avgpool(x)
        x = self.classifier(x)
        return x

    def construct(self, x: Tensor) -> Tensor:
        x = self.forward_feature(x)
        x = self.forward_head(x)
        return x

mindcv.models.dpn.DualPathBlock

Bases: Cell

A block for Dual Path Networks to combine proj, residual and densely network

Source code in mindcv\models\dpn.py
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
class DualPathBlock(nn.Cell):
    """A block for Dual Path Networks combining a residual (summed) path and a
    densely connected (concatenated) path.

    Args:
        in_channel: number of input channels.
        num_1x1_a: output channels of the first 1x1 conv in the bottleneck.
        num_3x3_b: output channels of the grouped 3x3 conv.
        num_1x1_c: output channels of the residual-path 1x1 conv.
        inc: channel increment of the dense path.
        g: number of groups in the 3x3 conv.
        _type: block type — "proj" (projection shortcut, stride 1), "down"
            (projection shortcut, stride 2) or "normal" (identity shortcut).
            Default: "normal".
        cat_input: whether the input is a (residual, dense) pair that must be
            concatenated before use. Default: True.

    Raises:
        ValueError: if ``_type`` is not one of "proj", "down", "normal".
    """

    def __init__(
        self,
        in_channel: int,
        num_1x1_a: int,
        num_3x3_b: int,
        num_1x1_c: int,
        inc: int,
        g: int,
        _type: str = "normal",
        cat_input: bool = True,
    ):
        super().__init__()
        self.num_1x1_c = num_1x1_c

        # fix: the original chain of independent `if`s left `key_stride` undefined for
        # an unknown `_type` (latent NameError); use elif and fail fast instead.
        if _type == "proj":
            key_stride = 1
            self.has_proj = True
        elif _type == "down":
            key_stride = 2
            self.has_proj = True
        elif _type == "normal":
            key_stride = 1
            self.has_proj = False
        else:
            raise ValueError(f"Unknown DualPathBlock _type: {_type!r}; expected 'proj', 'down' or 'normal'.")

        self.cat_input = cat_input

        if self.has_proj:
            # projection shortcut: BN-ReLU then two 1x1 convs producing the
            # residual-path and dense-path shortcut tensors
            self.c1x1_w_bn = nn.BatchNorm2d(in_channel, eps=1e-3, momentum=0.9)
            self.c1x1_w_relu = nn.ReLU()
            self.c1x1_w_r = nn.Conv2d(in_channel, num_1x1_c, kernel_size=1, stride=key_stride,
                                      pad_mode="pad", padding=0)
            self.c1x1_w_d = nn.Conv2d(in_channel, 2 * inc, kernel_size=1, stride=key_stride,
                                      pad_mode="pad", padding=0)

        self.layers = BottleBlock(in_channel, num_1x1_a, num_3x3_b, num_1x1_c, inc, g, key_stride)

    def construct(self, x: Tensor):
        # inputs arrive as a (residual, dense) pair except right after the stem
        if self.cat_input:
            data_in = ops.concat(x, axis=1)
        else:
            data_in = x

        if self.has_proj:
            data_o = self.c1x1_w_bn(data_in)
            data_o = self.c1x1_w_relu(data_o)
            data_o1 = self.c1x1_w_r(data_o)
            data_o2 = self.c1x1_w_d(data_o)
        else:
            # identity shortcut: reuse the incoming pair directly
            data_o1 = x[0]
            data_o2 = x[1]

        out = self.layers(data_in)
        summ = ops.add(data_o1, out[0])          # residual path: elementwise sum
        dense = ops.concat((data_o2, out[1]), axis=1)  # dense path: channel concat
        return (summ, dense)

mindcv.models.dpn.dpn107(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get 107 layers DPN model. Refer to the base class models.DPN for more details.

Source code in mindcv\models\dpn.py
304
305
306
307
308
309
310
311
312
313
314
315
@register_model
def dpn107(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> DPN:
    """Get 107 layers DPN model.
     Refer to the base class `models.DPN` for more details."""
    model = DPN(
        num_init_channel=128,
        k_r=200,
        g=50,
        k_sec=(4, 8, 20, 3),
        inc_sec=(20, 64, 64, 128),
        num_classes=num_classes,
        in_channels=in_channels,
        **kwargs,
    )
    if pretrained:
        load_pretrained(model, default_cfgs["dpn107"], num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.dpn.dpn131(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get 131 layers DPN model. Refer to the base class models.DPN for more details.

Source code in mindcv\models\dpn.py
290
291
292
293
294
295
296
297
298
299
300
301
@register_model
def dpn131(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> DPN:
    """Get 131 layers DPN model.
     Refer to the base class `models.DPN` for more details."""
    model = DPN(
        num_init_channel=128,
        k_r=160,
        g=40,
        k_sec=(4, 8, 28, 3),
        inc_sec=(16, 32, 32, 128),
        num_classes=num_classes,
        in_channels=in_channels,
        **kwargs,
    )
    if pretrained:
        load_pretrained(model, default_cfgs["dpn131"], num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.dpn.dpn92(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get 92 layers DPN model. Refer to the base class models.DPN for more details.

Source code in mindcv\models\dpn.py
262
263
264
265
266
267
268
269
270
271
272
273
@register_model
def dpn92(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> DPN:
    """Get 92 layers DPN model.
     Refer to the base class `models.DPN` for more details."""
    model = DPN(
        num_init_channel=64,
        k_r=96,
        g=32,
        k_sec=(3, 4, 20, 3),
        inc_sec=(16, 32, 24, 128),
        num_classes=num_classes,
        in_channels=in_channels,
        **kwargs,
    )
    if pretrained:
        load_pretrained(model, default_cfgs["dpn92"], num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.dpn.dpn98(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get 98 layers DPN model. Refer to the base class models.DPN for more details.

Source code in mindcv\models\dpn.py
276
277
278
279
280
281
282
283
284
285
286
287
@register_model
def dpn98(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> DPN:
    """Get 98 layers DPN model.
     Refer to the base class `models.DPN` for more details."""
    model = DPN(
        num_init_channel=96,
        k_r=160,
        g=40,
        k_sec=(3, 6, 20, 3),
        inc_sec=(16, 32, 32, 128),
        num_classes=num_classes,
        in_channels=in_channels,
        **kwargs,
    )
    if pretrained:
        load_pretrained(model, default_cfgs["dpn98"], num_classes=num_classes, in_channels=in_channels)
    return model

edgenext

mindcv.models.edgenext

MindSpore implementation of edgenext. Refer to EdgeNeXt: Efficiently Amalgamated CNN-Transformer Architecture for Mobile Vision Applications.

mindcv.models.edgenext.EdgeNeXt

Bases: Cell

EdgeNeXt model class, based on "Efficiently Amalgamated CNN-Transformer Architecture for Mobile Vision" <https://arxiv.org/abs/2206.10589>_

PARAMETER DESCRIPTION
in_channels

number of input channels. Default: 3

num_classes

number of classification classes. Default: 1000

DEFAULT: 1000

depths

the depths of each layer. Default: [0, 0, 0, 3]

DEFAULT: [3, 3, 9, 3]

dims

the middle dim of each layer. Default: [24, 48, 88, 168]

DEFAULT: [24, 48, 88, 168]

global_block

number of global block. Default: [0, 0, 0, 3]

DEFAULT: [0, 0, 0, 3]

global_block_type

type of global block. Default: ['None', 'None', 'None', 'SDTA']

DEFAULT: ['None', 'None', 'None', 'SDTA']

drop_path_rate

Stochastic Depth. Default: 0.

DEFAULT: 0.0

layer_scale_init_value

value of layer scale initialization. Default: 1e-6

DEFAULT: 1e-06

head_init_scale

scale of head initialization. Default: 1.

DEFAULT: 1.0

expan_ratio

ratio of expansion. Default: 4

DEFAULT: 4

kernel_sizes

kernel sizes of different stages. Default: [7, 7, 7, 7]

DEFAULT: [7, 7, 7, 7]

heads

number of attention heads. Default: [8, 8, 8, 8]

DEFAULT: [8, 8, 8, 8]

use_pos_embd_xca

use position embedding in xca or not. Default: [False, False, False, False]

DEFAULT: [False, False, False, False]

use_pos_embd_global

use position embedding globally or not. Default: False

DEFAULT: False

d2_scales

scales of splitting channels

DEFAULT: [2, 3, 4, 5]

Source code in mindcv\models\edgenext.py
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
class EdgeNeXt(nn.Cell):
    r"""EdgeNeXt model class, based on
    `"Efficiently Amalgamated CNN-Transformer Architecture for Mobile Vision" <https://arxiv.org/abs/2206.10589>`_

    Args:
        in_chans: number of input channels. Default: 3
        num_classes: number of classification classes. Default: 1000
        depths: the depths of each stage. Default: (3, 3, 9, 3)
        dims: the middle dim of each stage. Default: (24, 48, 88, 168)
        global_block: number of global (SDTA) blocks in each stage. Default: (0, 0, 0, 3)
        global_block_type: type of global block per stage. Default: ("None", "None", "None", "SDTA")
        drop_path_rate: stochastic depth rate. Default: 0.
        layer_scale_init_value: value of layer scale initialization. Default: 1e-6
        head_init_scale: scale of head initialization. Default: 1.
        expan_ratio: ratio of expansion. Default: 4
        kernel_sizes: conv kernel sizes of the stages. Default: (7, 7, 7, 7)
        heads: number of attention heads per stage. Default: (8, 8, 8, 8)
        use_pos_embd_xca: use position embedding in xca or not, per stage.
            Default: (False, False, False, False)
        use_pos_embd_global: use position embedding globally or not. Default: False
        d2_scales: scales of splitting channels per stage. Default: (2, 3, 4, 5)
    """
    # fix: default arguments were mutable lists (shared across calls); use tuples.
    def __init__(self, in_chans=3, num_classes=1000,
                 depths=(3, 3, 9, 3), dims=(24, 48, 88, 168),
                 global_block=(0, 0, 0, 3), global_block_type=("None", "None", "None", "SDTA"),
                 drop_path_rate=0., layer_scale_init_value=1e-6, head_init_scale=1., expan_ratio=4,
                 kernel_sizes=(7, 7, 7, 7), heads=(8, 8, 8, 8), use_pos_embd_xca=(False, False, False, False),
                 use_pos_embd_global=False, d2_scales=(2, 3, 4, 5), **kwargs):
        super().__init__()
        for g in global_block_type:
            assert g in ["None", "SDTA"]
        if use_pos_embd_global:
            self.pos_embd = PositionalEncodingFourier(dim=dims[0])
        else:
            self.pos_embd = None
        self.downsample_layers = nn.CellList()  # stem and 3 intermediate downsampling conv layers
        stem = nn.SequentialCell(
            nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4, has_bias=True),
            LayerNorm((dims[0],), epsilon=1e-6, norm_axis=1),
        )
        self.downsample_layers.append(stem)
        for i in range(3):
            downsample_layer = nn.SequentialCell(
                LayerNorm((dims[i],), epsilon=1e-6, norm_axis=1),
                nn.Conv2d(dims[i], dims[i + 1], kernel_size=2, stride=2, has_bias=True),
            )
            self.downsample_layers.append(downsample_layer)

        self.stages = nn.CellList()  # 4 feature resolution stages, each consisting of multiple residual blocks
        # linearly increasing stochastic-depth rates across all blocks
        dp_rates = list(np.linspace(0, drop_path_rate, sum(depths)))
        cur = 0
        for i in range(4):
            stage_blocks = []
            for j in range(depths[i]):
                # the last `global_block[i]` blocks of a stage are global (SDTA) blocks
                if j > depths[i] - global_block[i] - 1:
                    if global_block_type[i] == "SDTA":
                        stage_blocks.append(SDTAEncoder(dim=dims[i], drop_path=dp_rates[cur + j],
                                                        expan_ratio=expan_ratio, scales=d2_scales[i],
                                                        use_pos_emb=use_pos_embd_xca[i], num_heads=heads[i]))
                    else:
                        raise NotImplementedError
                else:
                    stage_blocks.append(ConvEncoder(dim=dims[i], drop_path=dp_rates[cur + j],
                                                    layer_scale_init_value=layer_scale_init_value,
                                                    expan_ratio=expan_ratio, kernel_size=kernel_sizes[i]))

            self.stages.append(nn.SequentialCell(*stage_blocks))
            cur += depths[i]
        self.norm = nn.LayerNorm((dims[-1],), epsilon=1e-6)  # Final norm layer
        self.head = nn.Dense(dims[-1], num_classes)

        # classifier dropout is disabled; a "classifier_dropout" kwarg is accepted
        # but currently unused — TODO confirm whether it should be wired in
        self.head_dropout = Dropout(p=0.0)
        self.head_init_scale = head_init_scale
        self._initialize_weights()

    def _initialize_weights(self) -> None:
        """Initialize weights for cells, then rescale the head by head_init_scale."""
        for _, cell in self.cells_and_names():
            if isinstance(cell, (nn.Dense, nn.Conv2d)):
                cell.weight.set_data(
                    init.initializer(init.TruncatedNormal(sigma=0.02), cell.weight.shape, cell.weight.dtype)
                )
                if isinstance(cell, nn.Dense) and cell.bias is not None:
                    cell.bias.set_data(init.initializer(init.Zero(), cell.bias.shape, cell.bias.dtype))
            elif isinstance(cell, (nn.LayerNorm)):
                cell.gamma.set_data(init.initializer(init.One(), cell.gamma.shape, cell.gamma.dtype))
                cell.beta.set_data(init.initializer(init.Zero(), cell.beta.shape, cell.beta.dtype))
        self.head.weight.set_data(self.head.weight * self.head_init_scale)
        self.head.bias.set_data(self.head.bias * self.head_init_scale)

    def forward_features(self, x):
        """Run stem + 4 stages; optional Fourier position embedding is added after stage 0."""
        x = self.downsample_layers[0](x)
        x = self.stages[0](x)
        if self.pos_embd is not None:
            B, C, H, W = x.shape
            x = x + self.pos_embd(B, H, W)
        for i in range(1, 4):
            x = self.downsample_layers[i](x)
            x = self.stages[i](x)
        return self.norm(x.mean([-2, -1]))  # Global average pooling, (N, C, H, W) -> (N, C)

    def construct(self, x):
        x = self.forward_features(x)
        x = self.head(self.head_dropout(x))
        return x

mindcv.models.edgenext.LayerNorm

Bases: LayerNorm

LayerNorm for channels_first tensors with 2d spatial dimensions (ie N, C, H, W).

Source code in mindcv\models\edgenext.py
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
class LayerNorm(nn.LayerNorm):
    r"""LayerNorm supporting both channels_last (N, H, W, C) and channels_first
    (N, C, H, W) tensors with 2d spatial dimensions.

    Args:
        normalized_shape: shape of the normalized (channel) axis.
        epsilon: small value added to the variance for numerical stability.
        norm_axis: axis holding the channels; -1 for channels_last, 1 for
            channels_first. Default: -1.
    """

    def __init__(
        self,
        normalized_shape: Tuple[int],
        epsilon: float,
        norm_axis: int = -1,
    ) -> None:
        super().__init__(normalized_shape=normalized_shape, epsilon=epsilon)
        # fix: the message referred to "ConvNextLayerNorm" (copy-paste from convnext.py)
        assert norm_axis in (-1, 1), "LayerNorm's norm_axis must be 1 or -1."
        self.norm_axis = norm_axis

    def construct(self, input_x: Tensor) -> Tensor:
        if self.norm_axis == -1:
            y, _, _ = self.layer_norm(input_x, self.gamma, self.beta)
        else:
            # channels_first: move channels last, normalize, move them back
            input_x = ops.transpose(input_x, (0, 2, 3, 1))
            y, _, _ = self.layer_norm(input_x, self.gamma, self.beta)
            y = ops.transpose(y, (0, 3, 1, 2))
        return y

mindcv.models.edgenext.edgenext_base(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get edgenext_base model. Refer to the base class models.EdgeNeXt for more details.

Source code in mindcv\models\edgenext.py
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
@register_model
def edgenext_base(pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs) -> EdgeNeXt:
    """Get edgenext_base model.
    Refer to the base class `models.EdgeNeXt` for more details."""
    model_args = dict(
        depths=[3, 3, 9, 3],
        dims=[80, 160, 288, 584],
        expan_ratio=4,
        num_classes=num_classes,
        global_block=[0, 1, 1, 1],
        global_block_type=["None", "SDTA", "SDTA", "SDTA"],
        use_pos_embd_xca=[False, True, False, False],
        kernel_sizes=[3, 5, 7, 9],
        d2_scales=[2, 2, 3, 4],
    )
    model = EdgeNeXt(**model_args, **kwargs)
    if pretrained:
        load_pretrained(model, default_cfgs["edgenext_base"], num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.edgenext.edgenext_small(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get edgenext_small model. Refer to the base class models.EdgeNeXt for more details.

Source code in mindcv\models\edgenext.py
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
@register_model
def edgenext_small(pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs) -> EdgeNeXt:
    """Get edgenext_small model.
    Refer to the base class `models.EdgeNeXt` for more details."""
    model_args = dict(
        depths=[3, 3, 9, 3],
        dims=[48, 96, 160, 304],
        expan_ratio=4,
        num_classes=num_classes,
        global_block=[0, 1, 1, 1],
        global_block_type=["None", "SDTA", "SDTA", "SDTA"],
        use_pos_embd_xca=[False, True, False, False],
        kernel_sizes=[3, 5, 7, 9],
        d2_scales=[2, 2, 3, 4],
    )
    model = EdgeNeXt(**model_args, **kwargs)
    if pretrained:
        load_pretrained(model, default_cfgs["edgenext_small"], num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.edgenext.edgenext_x_small(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get edgenext_x_small model. Refer to the base class models.EdgeNeXt for more details.

Source code in mindcv\models\edgenext.py
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
@register_model
def edgenext_x_small(pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs) -> EdgeNeXt:
    """Build the edgenext_x_small variant.
    Refer to the base class `models.EdgeNeXt` for more details."""
    # Architecture hyper-parameters of the "x_small" variant.
    arch_params = dict(
        depths=[3, 3, 9, 3],
        dims=[32, 64, 100, 192],
        expan_ratio=4,
        global_block=[0, 1, 1, 1],
        global_block_type=["None", "SDTA", "SDTA", "SDTA"],
        use_pos_embd_xca=[False, True, False, False],
        kernel_sizes=[3, 5, 7, 9],
        heads=[4, 4, 4, 4],
        d2_scales=[2, 2, 3, 4],
    )
    net = EdgeNeXt(num_classes=num_classes, **arch_params, **kwargs)
    if pretrained:
        load_pretrained(net, default_cfgs["edgenext_x_small"], num_classes=num_classes, in_channels=in_channels)
    return net

mindcv.models.edgenext.edgenext_xx_small(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get edgenext_xx_small model. Refer to the base class models.EdgeNeXt for more details.

Source code in mindcv\models\edgenext.py
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
@register_model
def edgenext_xx_small(pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs) -> EdgeNeXt:
    """Build the edgenext_xx_small variant.
    Refer to the base class `models.EdgeNeXt` for more details."""
    # Architecture hyper-parameters of the "xx_small" variant.
    arch_params = dict(
        depths=[2, 2, 6, 2],
        dims=[24, 48, 88, 168],
        expan_ratio=4,
        global_block=[0, 1, 1, 1],
        global_block_type=["None", "SDTA", "SDTA", "SDTA"],
        use_pos_embd_xca=[False, True, False, False],
        kernel_sizes=[3, 5, 7, 9],
        heads=[4, 4, 4, 4],
        d2_scales=[2, 2, 3, 4],
    )
    net = EdgeNeXt(num_classes=num_classes, **arch_params, **kwargs)
    if pretrained:
        load_pretrained(net, default_cfgs["edgenext_xx_small"], num_classes=num_classes, in_channels=in_channels)
    return net

efficientnet

mindcv.models.efficientnet

EfficientNet Architecture.

mindcv.models.efficientnet.EfficientNet

Bases: Cell

EfficientNet architecture. EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks <https://arxiv.org/abs/1905.11946>_.

PARAMETER DESCRIPTION
arch

The name of the model.

TYPE: str

dropout_rate

The dropout rate of efficientnet.

TYPE: float

width_mult

The ratio of the channel. Default: 1.0.

TYPE: float DEFAULT: 1.0

depth_mult

The ratio of num_layers. Default: 1.0.

TYPE: float DEFAULT: 1.0

in_channels

The input channels. Default: 3.

TYPE: int DEFAULT: 3

num_classes

The number of class. Default: 1000.

TYPE: int DEFAULT: 1000

inverted_residual_setting

The settings of block. Default: None.

TYPE: Sequence[Union[MBConvConfig, FusedMBConvConfig]] DEFAULT: None

drop_path_prob

The drop path rate of MBConv. Default: 0.2.

TYPE: float DEFAULT: 0.2

norm_layer

The normalization layer. Default: None.

TYPE: Cell DEFAULT: None

Inputs
  • x (Tensor) - Tensor of shape :math:(N, C_{in}, H_{in}, W_{in}).
Outputs

Tensor of shape :math:(N, 1000).

Source code in mindcv\models\efficientnet.py
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
class EfficientNet(nn.Cell):
    """
    EfficientNet architecture.
    `EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks <https://arxiv.org/abs/1905.11946>`_.

    Args:
        arch (str): The name of the model.
        dropout_rate (float): The dropout rate of efficientnet.
        width_mult (float): The ratio of the channel. Default: 1.0.
        depth_mult (float): The ratio of num_layers. Default: 1.0.
        in_channels (int): The input channels. Default: 3.
        num_classes (int): The number of class. Default: 1000.
        inverted_residual_setting (Sequence[Union[MBConvConfig, FusedMBConvConfig]], optional): The settings of block.
            Default: None.
        drop_path_prob (float): The drop path rate of MBConv. Default: 0.2.
        norm_layer (nn.Cell, optional): The normalization layer. Default: None.

    Inputs:
        - **x** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.

    Outputs:
        Tensor of shape :math:`(N, 1000)`.

    Raises:
        ValueError: If `inverted_residual_setting` is not given and `arch` does not match
            any known EfficientNet / EfficientNetV2 variant.
    """

    def __init__(
        self,
        arch: str,
        dropout_rate: float,
        width_mult: float = 1.0,
        depth_mult: float = 1.0,
        in_channels: int = 3,
        num_classes: int = 1000,
        inverted_residual_setting: Optional[Sequence[Union[MBConvConfig, FusedMBConvConfig]]] = None,
        drop_path_prob: float = 0.2,
        norm_layer: Optional[nn.Cell] = None,
    ) -> None:
        super().__init__()
        self.last_channel = None

        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
            # Wider variants use different BatchNorm hyper-parameters.
            if width_mult >= 1.6:
                norm_layer = partial(nn.BatchNorm2d, eps=0.001, momentum=0.99)

        layers: List[nn.Cell] = []

        if not inverted_residual_setting:
            if arch.startswith("efficientnet_b"):
                # V1 stage table; channel/layer counts are scaled by width/depth multipliers.
                bneck_conf = partial(MBConvConfig, width_cnf=width_mult, depth_cnf=depth_mult)
                inverted_residual_setting = [
                    bneck_conf(1, 3, 1, 32, 16, 1),
                    bneck_conf(6, 3, 2, 16, 24, 2),
                    bneck_conf(6, 5, 2, 24, 40, 2),
                    bneck_conf(6, 3, 2, 40, 80, 3),
                    bneck_conf(6, 5, 1, 80, 112, 3),
                    bneck_conf(6, 5, 2, 112, 192, 4),
                    bneck_conf(6, 3, 1, 192, 320, 1),
                ]
            elif arch.startswith("efficientnet_v2_s"):
                inverted_residual_setting = [
                    FusedMBConvConfig(1, 3, 1, 24, 24, 2),
                    FusedMBConvConfig(4, 3, 2, 24, 48, 4),
                    FusedMBConvConfig(4, 3, 2, 48, 64, 4),
                    MBConvConfig(4, 3, 2, 64, 128, 6),
                    MBConvConfig(6, 3, 1, 128, 160, 9),
                    MBConvConfig(6, 3, 2, 160, 256, 15),
                ]
                self.last_channel = 1280
            elif arch.startswith("efficientnet_v2_m"):
                inverted_residual_setting = [
                    FusedMBConvConfig(1, 3, 1, 24, 24, 3),
                    FusedMBConvConfig(4, 3, 2, 24, 48, 5),
                    FusedMBConvConfig(4, 3, 2, 48, 80, 5),
                    MBConvConfig(4, 3, 2, 80, 160, 7),
                    MBConvConfig(6, 3, 1, 160, 176, 14),
                    MBConvConfig(6, 3, 2, 176, 304, 18),
                    MBConvConfig(6, 3, 1, 304, 512, 5),
                ]
                self.last_channel = 1280
            elif arch.startswith("efficientnet_v2_l"):
                inverted_residual_setting = [
                    FusedMBConvConfig(1, 3, 1, 32, 32, 4),
                    FusedMBConvConfig(4, 3, 2, 32, 64, 7),
                    FusedMBConvConfig(4, 3, 2, 64, 96, 7),
                    MBConvConfig(4, 3, 2, 96, 192, 10),
                    MBConvConfig(6, 3, 1, 192, 224, 19),
                    MBConvConfig(6, 3, 2, 224, 384, 25),
                    MBConvConfig(6, 3, 1, 384, 640, 7),
                ]
                self.last_channel = 1280
            elif arch.startswith("efficientnet_v2_xl"):
                inverted_residual_setting = [
                    FusedMBConvConfig(1, 3, 1, 32, 32, 4),
                    FusedMBConvConfig(4, 3, 2, 32, 64, 8),
                    FusedMBConvConfig(4, 3, 2, 64, 96, 8),
                    MBConvConfig(4, 3, 2, 96, 192, 16),
                    MBConvConfig(6, 3, 1, 192, 256, 24),
                    MBConvConfig(6, 3, 2, 256, 512, 32),
                    MBConvConfig(6, 3, 1, 512, 640, 8),
                ]
                self.last_channel = 1280
            else:
                # Fail fast with a clear message instead of crashing below
                # with an opaque TypeError on a None setting.
                raise ValueError(f"unknown EfficientNet architecture: {arch}")

        # building first layer
        firstconv_output_channels = inverted_residual_setting[0].input_channels
        layers.extend([
            nn.Conv2d(in_channels, firstconv_output_channels, kernel_size=3, stride=2),
            norm_layer(firstconv_output_channels),
            Swish(),
        ])

        # `total_reduction` tracks the cumulative spatial downsampling for feature_info.
        total_reduction = 2
        self.feature_info = [dict(chs=firstconv_output_channels, reduction=total_reduction,
                                  name=f'features.{len(layers) - 1}')]

        # building MBConv blocks
        total_stage_blocks = sum(cnf.num_layers for cnf in inverted_residual_setting)
        stage_block_id = 0

        # cnf is the settings of one stage
        for cnf in inverted_residual_setting:
            stage: List[nn.Cell] = []

            # cnf.num_layers is the number of identical blocks in the stage
            for _ in range(cnf.num_layers):
                # copy to avoid modifications. shallow copy is enough
                block_cnf = copy.copy(cnf)

                # isinstance is robust; string-matching on type(...) is fragile.
                block = FusedMBConv if isinstance(block_cnf, FusedMBConvConfig) else MBConv

                # overwrite info if not the first conv in the stage
                if stage:
                    block_cnf.input_channels = block_cnf.out_channels
                    block_cnf.stride = 1

                # stochastic-depth probability grows linearly with block depth
                sd_prob = drop_path_prob * float(stage_block_id) / total_stage_blocks

                total_reduction *= block_cnf.stride

                stage.append(block(block_cnf, sd_prob, norm_layer))
                stage_block_id += 1

            layers.append(nn.SequentialCell(stage))

            self.feature_info.append(dict(chs=cnf.out_channels, reduction=total_reduction,
                                          name=f'features.{len(layers) - 1}'))

        # building last several layers
        lastconv_input_channels = inverted_residual_setting[-1].out_channels
        lastconv_output_channels = self.last_channel if self.last_channel is not None else 4 * lastconv_input_channels
        layers.extend([
            nn.Conv2d(lastconv_input_channels, lastconv_output_channels, kernel_size=1),
            norm_layer(lastconv_output_channels),
            Swish(),
        ])

        self.feature_info.append(dict(chs=lastconv_output_channels, reduction=total_reduction,
                                      name=f'features.{len(layers) - 1}'))
        self.flatten_sequential = True

        self.features = nn.SequentialCell(layers)
        self.avgpool = GlobalAvgPooling()
        self.dropout = Dropout(p=dropout_rate)
        self.mlp_head = nn.Dense(lastconv_output_channels, num_classes)
        self._initialize_weights()

    def forward_features(self, x: Tensor) -> Tensor:
        """Extract pooled backbone features; dropout is applied only in training."""
        x = self.features(x)

        x = self.avgpool(x)

        if self.training:
            x = self.dropout(x)
        return x

    def forward_head(self, x: Tensor) -> Tensor:
        """Map pooled features to class logits."""
        return self.mlp_head(x)

    def construct(self, x: Tensor) -> Tensor:
        """construct"""
        x = self.forward_features(x)
        return self.forward_head(x)

    def _initialize_weights(self) -> None:
        """Initialize weights for cells."""
        for _, cell in self.cells_and_names():
            if isinstance(cell, nn.Dense):
                # Uniform init scaled by fan-in of the dense layer.
                init_range = 1.0 / np.sqrt(cell.weight.shape[0])
                cell.weight.set_data(weight_init.initializer(Uniform(init_range), cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(weight_init.initializer(weight_init.Zero(), cell.bias.shape, cell.bias.dtype))
            if isinstance(cell, nn.Conv2d):
                # He-style normal init based on the conv fan-out.
                out_channel, _, kernel_size_h, kernel_size_w = cell.weight.shape
                stddev = np.sqrt(2 / int(out_channel * kernel_size_h * kernel_size_w))
                cell.weight.set_data(
                    weight_init.initializer(Normal(sigma=stddev), cell.weight.shape, cell.weight.dtype)
                )
                if cell.bias is not None:
                    cell.bias.set_data(weight_init.initializer(weight_init.Zero(), cell.bias.shape, cell.bias.dtype))
mindcv.models.efficientnet.EfficientNet.construct(x)

construct

Source code in mindcv\models\efficientnet.py
456
457
458
459
def construct(self, x: Tensor) -> Tensor:
    """Run the full forward pass: backbone features, then classifier head."""
    feats = self.forward_features(x)
    return self.forward_head(feats)

mindcv.models.efficientnet.FusedMBConv

Bases: Cell

FusedMBConv

Source code in mindcv\models\efficientnet.py
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
class FusedMBConv(nn.Cell):
    """Fused MBConv block (EfficientNetV2): the expansion 1x1 and depthwise
    convs are fused into a single regular conv."""

    def __init__(
        self,
        cnf: FusedMBConvConfig,
        drop_path_prob: float,
        norm: Optional[nn.Cell] = None,
    ) -> None:
        super().__init__()

        if cnf.stride < 1 or cnf.stride > 2:
            raise ValueError("illegal stride value")

        # Residual add is only valid when spatial size and channel count are preserved.
        self.shortcut = cnf.stride == 1 and cnf.input_channels == cnf.out_channels

        mid_chs = cnf.adjust_channels(cnf.input_channels, cnf.expand_ratio)
        cells: List[nn.Cell] = []

        if mid_chs != cnf.input_channels:
            # fused expansion conv followed by a 1x1 projection (no activation after project)
            cells.extend([
                nn.Conv2d(cnf.input_channels, mid_chs, kernel_size=cnf.kernel_size,
                          stride=cnf.stride),
                norm(mid_chs),
                Swish(),
            ])
            cells.extend([
                nn.Conv2d(mid_chs, cnf.out_channels, kernel_size=1),
                norm(cnf.out_channels),
            ])
        else:
            # expand_ratio == 1: a single fused conv does all the work
            cells.extend([
                nn.Conv2d(cnf.input_channels, cnf.out_channels, kernel_size=cnf.kernel_size,
                          stride=cnf.stride),
                norm(cnf.out_channels),
                Swish(),
            ])

        self.block = nn.SequentialCell(cells)
        self.dropout = DropPath(drop_path_prob)
        self.out_channels = cnf.out_channels

    def construct(self, x) -> Tensor:
        out = self.block(x)
        if not self.shortcut:
            return out
        # stochastic depth on the residual branch, then identity shortcut
        return self.dropout(out) + x

mindcv.models.efficientnet.FusedMBConvConfig

Bases: MBConvConfig

FusedMBConvConfig

Source code in mindcv\models\efficientnet.py
206
207
208
209
210
211
212
213
214
215
216
217
218
219
class FusedMBConvConfig(MBConvConfig):
    """Configuration of one FusedMBConv stage (Table 4 of the EfficientNetV2 paper).

    Uses the parent's default width/depth multipliers of 1.0, so the table
    values are taken as-is.
    """

    def __init__(
        self,
        expand_ratio: float,
        kernel_size: int,
        stride: int,
        in_chs: int,
        out_chs: int,
        num_layers: int,
    ) -> None:
        super().__init__(
            expand_ratio=expand_ratio,
            kernel_size=kernel_size,
            stride=stride,
            in_chs=in_chs,
            out_chs=out_chs,
            num_layers=num_layers,
        )

mindcv.models.efficientnet.MBConv

Bases: Cell

MBConv Module.

PARAMETER DESCRIPTION
cnf

The class which contains the parameters (in_channels, out_channels, num_layers) and the functions which help calculate the parameters after multiplying the expand_ratio.

TYPE: MBConvConfig

drop_path_prob

The drop path rate in MBConv. Default: 0.2.

TYPE: float DEFAULT: 0.2

norm

The BatchNorm Method. Default: None.

TYPE: Cell DEFAULT: None

se_layer

The squeeze-excite Module. Default: SqueezeExcite.

TYPE: Cell DEFAULT: SqueezeExcite

RETURNS DESCRIPTION

Tensor

Source code in mindcv\models\efficientnet.py
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
class MBConv(nn.Cell):
    """
    MBConv Module (inverted residual bottleneck with squeeze-excitation).

    Args:
        cnf (MBConvConfig): Stage configuration holding channel counts, kernel size,
            stride, and the helpers that apply the expand_ratio.
        drop_path_prob: The drop path rate in MBConv. Default: 0.2.
        norm (nn.Cell): The BatchNorm Method. Default: None.
        se_layer (nn.Cell): The squeeze-excite Module. Default: SqueezeExcite.

    Returns:
        Tensor
    """

    def __init__(
        self,
        cnf: MBConvConfig,
        drop_path_prob: float = 0.2,
        norm: Optional[nn.Cell] = None,
        se_layer: Callable[..., nn.Cell] = SqueezeExcite,
    ) -> None:
        super().__init__()

        # Residual add is only valid when spatial size and channel count are preserved.
        self.shortcut = cnf.stride == 1 and cnf.input_channels == cnf.out_channels

        mid_chs = cnf.adjust_channels(cnf.input_channels, cnf.expand_ratio)
        cells: List[nn.Cell] = []

        # 1x1 expansion conv, skipped when expand_ratio == 1.
        if mid_chs != cnf.input_channels:
            cells.extend([
                nn.Conv2d(cnf.input_channels, mid_chs, kernel_size=1),
                norm(mid_chs),
                Swish(),
            ])

        # Depthwise conv: one filter group per channel.
        cells.extend([
            nn.Conv2d(mid_chs, mid_chs, kernel_size=cnf.kernel_size,
                      stride=cnf.stride, group=mid_chs),
            norm(mid_chs),
            Swish(),
        ])

        # Squeeze-and-excitation; reduction computed from the *input* channels.
        squeeze_chs = max(1, cnf.input_channels // 4)
        cells.append(se_layer(in_channels=mid_chs, rd_channels=squeeze_chs, act_layer=Swish))

        # 1x1 projection back to the stage output width (no activation).
        cells.extend([
            nn.Conv2d(mid_chs, cnf.out_channels, kernel_size=1),
            norm(cnf.out_channels),
        ])

        self.block = nn.SequentialCell(cells)
        self.dropout = DropPath(drop_path_prob)
        self.out_channels = cnf.out_channels

    def construct(self, x) -> Tensor:
        out = self.block(x)
        if not self.shortcut:
            return out
        # stochastic depth on the residual branch, then identity shortcut
        return self.dropout(out) + x

mindcv.models.efficientnet.MBConvConfig

The parameters of MBConv which need to be multiplied by the expand_ratio.

PARAMETER DESCRIPTION
expand_ratio

The multiplier applied to in_channels to obtain the number of expanded channels.

TYPE: float

kernel_size

The kernel size of the depthwise conv.

TYPE: int

stride

The stride of the depthwise conv.

TYPE: int

in_chs

The input_channels of the MBConv Module.

TYPE: int

out_chs

The output_channels of the MBConv Module.

TYPE: int

num_layers

The num of MBConv Module.

TYPE: int

width_cnf

The ratio of the channel. Default: 1.0.

TYPE: float DEFAULT: 1.0

depth_cnf

The ratio of num_layers. Default: 1.0.

TYPE: float DEFAULT: 1.0

RETURNS DESCRIPTION

None

Examples:

>>> cnf = MBConvConfig(1, 3, 1, 32, 16, 1)
>>> print(cnf.input_channels)
Source code in mindcv\models\efficientnet.py
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
class MBConvConfig:
    """
    Holds the hyper-parameters of one MBConv stage, with channel and depth
    values already adjusted by the width/depth multipliers.

    Args:
        expand_ratio (float): Multiplier applied to in_chs to obtain the expanded width.
        kernel_size (int): The kernel size of the depthwise conv.
        stride (int): The stride of the depthwise conv.
        in_chs (int): The input_channels of the MBConv Module.
        out_chs (int): The output_channels of the MBConv Module.
        num_layers (int): The num of MBConv Module.
        width_cnf: The ratio of the channel. Default: 1.0.
        depth_cnf: The ratio of num_layers. Default: 1.0.

    Returns:
        None

    Examples:
        >>> cnf = MBConvConfig(1, 3, 1, 32, 16, 1)
        >>> print(cnf.input_channels)
    """

    def __init__(
        self,
        expand_ratio: float,
        kernel_size: int,
        stride: int,
        in_chs: int,
        out_chs: int,
        num_layers: int,
        width_cnf: float = 1.0,
        depth_cnf: float = 1.0,
    ) -> None:
        # Channel counts and depth are stored pre-scaled by the multipliers.
        self.input_channels = self.adjust_channels(in_chs, width_cnf)
        self.out_channels = self.adjust_channels(out_chs, width_cnf)
        self.num_layers = self.adjust_depth(num_layers, depth_cnf)
        self.expand_ratio = expand_ratio
        self.kernel_size = kernel_size
        self.stride = stride

    @staticmethod
    def adjust_channels(channels: int, width_cnf: float, min_value: Optional[int] = None) -> int:
        """
        Scale a channel count by the width multiplier, rounded to a multiple of 8.

        Args:
            channels (int): The number of channel.
            width_cnf (float): The ratio of channel.
            min_value (int, optional): The minimum number of channel. Default: None.

        Returns:
            int, the width of MBConv.
        """
        scaled = channels * width_cnf
        return make_divisible(scaled, 8, min_value)

    @staticmethod
    def adjust_depth(num_layers: int, depth_cnf: float) -> int:
        """
        Scale the number of layers in a stage by the depth multiplier, rounding up.

        Args:
            num_layers (int): The number of MBConv Module.
            depth_cnf (float): The ratio of num_layers.

        Returns:
            int, the depth of MBConv.
        """
        scaled = num_layers * depth_cnf
        return int(math.ceil(scaled))
mindcv.models.efficientnet.MBConvConfig.adjust_channels(channels, width_cnf, min_value=None) staticmethod

Calculate the width of MBConv.

PARAMETER DESCRIPTION
channels

The number of channel.

TYPE: int

width_cnf

The ratio of channel.

TYPE: float

min_value

The minimum number of channel. Default: None.

TYPE: int DEFAULT: None

RETURNS DESCRIPTION
int

int, the width of MBConv.

Source code in mindcv\models\efficientnet.py
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
@staticmethod
def adjust_channels(channels: int, width_cnf: float, min_value: Optional[int] = None) -> int:
    """
    Scale a channel count by the width multiplier, rounded to a multiple of 8.

    Args:
        channels (int): The number of channel.
        width_cnf (float): The ratio of channel.
        min_value (int, optional): The minimum number of channel. Default: None.

    Returns:
        int, the width of MBConv.
    """
    scaled = channels * width_cnf
    return make_divisible(scaled, 8, min_value)
mindcv.models.efficientnet.MBConvConfig.adjust_depth(num_layers, depth_cnf) staticmethod

Calculate the depth of MBConv.

PARAMETER DESCRIPTION
num_layers

The number of MBConv Module.

TYPE: int

depth_cnf

The ratio of num_layers.

TYPE: float

RETURNS DESCRIPTION
int

int, the depth of MBConv.

Source code in mindcv\models\efficientnet.py
123
124
125
126
127
128
129
130
131
132
133
134
135
136
@staticmethod
def adjust_depth(num_layers: int, depth_cnf: float) -> int:
    """
    Scale the number of layers in a stage by the depth multiplier, rounding up.

    Args:
        num_layers (int): The number of MBConv Module.
        depth_cnf (float): The ratio of num_layers.

    Returns:
        int, the depth of MBConv.
    """
    scaled = num_layers * depth_cnf
    return int(math.ceil(scaled))

mindcv.models.efficientnet.efficientnet_b0(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Constructs a EfficientNet B0 architecture from EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks <https://arxiv.org/abs/1905.11946>_.

PARAMETER DESCRIPTION
pretrained

If True, returns a model pretrained on IMAGENET. Default: False.

TYPE: bool DEFAULT: False

num_classes

The numbers of classes. Default: 1000.

TYPE: int DEFAULT: 1000

in_channels

The input channels. Default: 3.

TYPE: int DEFAULT: 3

Inputs
  • x (Tensor) - Tensor of shape :math:(N, C_{in}, H_{in}, W_{in}).
Outputs

Tensor of shape :math:(N, CLASSES_{out}).

Source code in mindcv\models\efficientnet.py
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
@register_model
def efficientnet_b0(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> EfficientNet:
    """
    Constructs a EfficientNet B0 architecture from
    `EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks <https://arxiv.org/abs/1905.11946>`_.

    Args:
        pretrained (bool): If True, returns a model pretrained on IMAGENET. Default: False.
        num_classes (int): The numbers of classes. Default: 1000.
        in_channels (int): The input channels. Default: 3.

    Inputs:
        - **x** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.

    Outputs:
        Tensor of shape :math:`(N, CLASSES_{out})`.
    """
    # B0 baseline: width_mult=1.0, depth_mult=1.0, dropout_rate=0.2.
    return _efficientnet("efficientnet_b0", 1.0, 1.0, 0.2, in_channels, num_classes, pretrained, **kwargs)

mindcv.models.efficientnet.efficientnet_b1(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Constructs a EfficientNet B1 architecture from EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks <https://arxiv.org/abs/1905.11946>_.

PARAMETER DESCRIPTION
pretrained

If True, returns a model pretrained on IMAGENET. Default: False.

TYPE: bool DEFAULT: False

num_classes

The numbers of classes. Default: 1000.

TYPE: int DEFAULT: 1000

in_channels

The input channels. Default: 3.

TYPE: int DEFAULT: 3

Inputs
  • x (Tensor) - Tensor of shape :math:(N, C_{in}, H_{in}, W_{in}).
Outputs

Tensor of shape :math:(N, CLASSES_{out}).

Source code in mindcv\models\efficientnet.py
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
@register_model
def efficientnet_b1(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> EfficientNet:
    """
    Constructs a EfficientNet B1 architecture from
    `EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks <https://arxiv.org/abs/1905.11946>`_.

    Args:
        pretrained (bool): If True, returns a model pretrained on IMAGENET. Default: False.
        num_classes (int): The numbers of classes. Default: 1000.
        in_channels (int): The input channels. Default: 3.

    Inputs:
        - **x** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.

    Outputs:
        Tensor of shape :math:`(N, CLASSES_{out})`.
    """
    # B1 scaling: width_mult=1.0, depth_mult=1.1, dropout_rate=0.2.
    return _efficientnet("efficientnet_b1", 1.0, 1.1, 0.2, in_channels, num_classes, pretrained, **kwargs)

mindcv.models.efficientnet.efficientnet_b2(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Constructs a EfficientNet B2 architecture from EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks <https://arxiv.org/abs/1905.11946>_.

PARAMETER DESCRIPTION
pretrained

If True, returns a model pretrained on IMAGENET. Default: False.

TYPE: bool DEFAULT: False

num_classes

The numbers of classes. Default: 1000.

TYPE: int DEFAULT: 1000

in_channels

The input channels. Default: 3.

TYPE: int DEFAULT: 3

Inputs
  • x (Tensor) - Tensor of shape :math:(N, C_{in}, H_{in}, W_{in}).
Outputs

Tensor of shape :math:(N, CLASSES_{out}).

Source code in mindcv\models\efficientnet.py
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
@register_model
def efficientnet_b2(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> EfficientNet:
    """
    Constructs a EfficientNet B2 architecture from
    `EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks <https://arxiv.org/abs/1905.11946>`_.

    Args:
        pretrained (bool): If True, returns a model pretrained on IMAGENET. Default: False.
        num_classes (int): The numbers of classes. Default: 1000.
        in_channels (int): The input channels. Default: 3.

    Inputs:
        - **x** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.

    Outputs:
        Tensor of shape :math:`(N, CLASSES_{out})`.
    """
    # B2 scaling: width_mult=1.1, depth_mult=1.2, dropout_rate=0.3.
    return _efficientnet("efficientnet_b2", 1.1, 1.2, 0.3, in_channels, num_classes, pretrained, **kwargs)

mindcv.models.efficientnet.efficientnet_b3(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Constructs a EfficientNet B3 architecture from EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks <https://arxiv.org/abs/1905.11946>_.

PARAMETER DESCRIPTION
pretrained

If True, returns a model pretrained on IMAGENET. Default: False.

TYPE: bool DEFAULT: False

num_classes

The numbers of classes. Default: 1000.

TYPE: int DEFAULT: 1000

in_channels

The input channels. Default: 3.

TYPE: int DEFAULT: 3

Inputs
  • x (Tensor) - Tensor of shape :math:(N, C_{in}, H_{in}, W_{in}).
Outputs

Tensor of shape :math:(N, CLASSES_{out}).

Source code in mindcv\models\efficientnet.py
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
@register_model
def efficientnet_b3(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> EfficientNet:
    """
    Constructs a EfficientNet B3 architecture from
    `EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks <https://arxiv.org/abs/1905.11946>`_.

    Args:
        pretrained (bool): If True, returns a model pretrained on IMAGENET. Default: False.
        num_classes (int): The numbers of classes. Default: 1000.
        in_channels (int): The input channels. Default: 3.

    Inputs:
        - **x** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.

    Outputs:
        Tensor of shape :math:`(N, CLASSES_{out})`.
    """
    # B3 scaling: width_mult=1.2, depth_mult=1.4, dropout_rate=0.3.
    return _efficientnet("efficientnet_b3", 1.2, 1.4, 0.3, in_channels, num_classes, pretrained, **kwargs)

mindcv.models.efficientnet.efficientnet_b4(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Constructs a EfficientNet B4 architecture from EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks <https://arxiv.org/abs/1905.11946>_.

PARAMETER DESCRIPTION
pretrained

If True, returns a model pretrained on IMAGENET. Default: False.

TYPE: bool DEFAULT: False

num_classes

The numbers of classes. Default: 1000.

TYPE: int DEFAULT: 1000

in_channels

The input channels. Default: 3.

TYPE: int DEFAULT: 3

Inputs
  • x (Tensor) - Tensor of shape :math:(N, C_{in}, H_{in}, W_{in}).
Outputs

Tensor of shape :math:(N, CLASSES_{out}).

Source code in mindcv\models\efficientnet.py
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
@register_model
def efficientnet_b4(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> EfficientNet:
    """
    Constructs a EfficientNet B4 architecture from
    `EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks <https://arxiv.org/abs/1905.11946>`_.

    Args:
        pretrained (bool): If True, returns a model pretrained on IMAGENET. Default: False.
        num_classes (int): The numbers of classes. Default: 1000.
        in_channels (int): The input channels. Default: 3.

    Inputs:
        - **x** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.

    Outputs:
        Tensor of shape :math:`(N, CLASSES_{out})`.
    """
    # B4 scaling: width_mult=1.4, depth_mult=1.8, dropout_rate=0.4.
    return _efficientnet("efficientnet_b4", 1.4, 1.8, 0.4, in_channels, num_classes, pretrained, **kwargs)

mindcv.models.efficientnet.efficientnet_b5(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Constructs a EfficientNet B5 architecture from EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks <https://arxiv.org/abs/1905.11946>_.

PARAMETER DESCRIPTION
pretrained

If True, returns a model pretrained on IMAGENET. Default: False.

TYPE: bool DEFAULT: False

num_classes

The number of classes. Default: 1000.

TYPE: int DEFAULT: 1000

in_channels

The input channels. Default: 3.

TYPE: int DEFAULT: 3

Inputs
  • x (Tensor) - Tensor of shape :math:(N, C_{in}, H_{in}, W_{in}).
Outputs

Tensor of shape :math:(N, CLASSES_{out}).

Source code in mindcv\models\efficientnet.py
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
@register_model
def efficientnet_b5(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> EfficientNet:
    """
    Constructs an EfficientNet B5 architecture from
    `EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks <https://arxiv.org/abs/1905.11946>`_.

    Args:
        pretrained (bool): If True, returns a model pretrained on IMAGENET. Default: False.
        num_classes (int): The number of classes. Default: 1000.
        in_channels (int): The input channels. Default: 3.

    Inputs:
        - **x** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.

    Outputs:
        Tensor of shape :math:`(N, CLASSES_{out})`.
    """
    # B5 scaling coefficients: width 1.6, depth 2.2, dropout 0.4.
    return _efficientnet("efficientnet_b5", 1.6, 2.2, 0.4, in_channels, num_classes, pretrained, **kwargs)

mindcv.models.efficientnet.efficientnet_b6(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Constructs a EfficientNet B6 architecture from EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks <https://arxiv.org/abs/1905.11946>_.

PARAMETER DESCRIPTION
pretrained

If True, returns a model pretrained on IMAGENET. Default: False.

TYPE: bool DEFAULT: False

num_classes

The number of classes. Default: 1000.

TYPE: int DEFAULT: 1000

in_channels

The input channels. Default: 3.

TYPE: int DEFAULT: 3

Inputs
  • x (Tensor) - Tensor of shape :math:(N, C_{in}, H_{in}, W_{in}).
Outputs

Tensor of shape :math:(N, CLASSES_{out}).

Source code in mindcv\models\efficientnet.py
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
@register_model
def efficientnet_b6(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> EfficientNet:
    """
    Constructs an EfficientNet B6 architecture from
    `EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks <https://arxiv.org/abs/1905.11946>`_.

    Args:
        pretrained (bool): If True, returns a model pretrained on IMAGENET. Default: False.
        num_classes (int): The number of classes. Default: 1000.
        in_channels (int): The input channels. Default: 3.

    Inputs:
        - **x** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.

    Outputs:
        Tensor of shape :math:`(N, CLASSES_{out})`.
    """
    # B6 scaling coefficients: width 1.8, depth 2.6, dropout 0.5.
    return _efficientnet("efficientnet_b6", 1.8, 2.6, 0.5, in_channels, num_classes, pretrained, **kwargs)

mindcv.models.efficientnet.efficientnet_b7(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Constructs a EfficientNet B7 architecture from EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks <https://arxiv.org/abs/1905.11946>_.

PARAMETER DESCRIPTION
pretrained

If True, returns a model pretrained on IMAGENET. Default: False.

TYPE: bool DEFAULT: False

num_classes

The number of classes. Default: 1000.

TYPE: int DEFAULT: 1000

in_channels

The input channels. Default: 3.

TYPE: int DEFAULT: 3

Inputs
  • x (Tensor) - Tensor of shape :math:(N, C_{in}, H_{in}, W_{in}).
Outputs

Tensor of shape :math:(N, CLASSES_{out}).

Source code in mindcv\models\efficientnet.py
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
@register_model
def efficientnet_b7(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> EfficientNet:
    """
    Constructs an EfficientNet B7 architecture from
    `EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks <https://arxiv.org/abs/1905.11946>`_.

    Args:
        pretrained (bool): If True, returns a model pretrained on IMAGENET. Default: False.
        num_classes (int): The number of classes. Default: 1000.
        in_channels (int): The input channels. Default: 3.

    Inputs:
        - **x** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.

    Outputs:
        Tensor of shape :math:`(N, CLASSES_{out})`.
    """
    # B7 scaling coefficients: width 2.0, depth 3.1, dropout 0.5.
    return _efficientnet("efficientnet_b7", 2.0, 3.1, 0.5, in_channels, num_classes, pretrained, **kwargs)

mindcv.models.efficientnet.efficientnet_v2_l(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Constructs an EfficientNetV2 L architecture from EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks <https://arxiv.org/abs/1905.11946>_.

PARAMETER DESCRIPTION
pretrained

If True, returns a model pretrained on IMAGENET. Default: False.

TYPE: bool DEFAULT: False

num_classes

The number of classes. Default: 1000.

TYPE: int DEFAULT: 1000

in_channels

The input channels. Default: 3.

TYPE: int DEFAULT: 3

Inputs
  • x (Tensor) - Tensor of shape :math:(N, C_{in}, H_{in}, W_{in}).
Outputs

Tensor of shape :math:(N, CLASSES_{out}).

Source code in mindcv\models\efficientnet.py
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
@register_model
def efficientnet_v2_l(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> EfficientNet:
    """
    Constructs an EfficientNetV2 L architecture from
    `EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks <https://arxiv.org/abs/1905.11946>`_.

    Args:
        pretrained (bool): If True, returns a model pretrained on IMAGENET. Default: False.
        num_classes (int): The number of classes. Default: 1000.
        in_channels (int): The input channels. Default: 3.

    Inputs:
        - **x** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.

    Outputs:
        Tensor of shape :math:`(N, CLASSES_{out})`.
    """
    # V2 variants use fixed (1.0, 1.0) scaling; the stage config is selected by name.
    return _efficientnet("efficientnet_v2_l", 1.0, 1.0, 0.2, in_channels, num_classes, pretrained, **kwargs)

mindcv.models.efficientnet.efficientnet_v2_m(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Constructs an EfficientNetV2 M architecture from EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks <https://arxiv.org/abs/1905.11946>_.

PARAMETER DESCRIPTION
pretrained

If True, returns a model pretrained on IMAGENET. Default: False.

TYPE: bool DEFAULT: False

num_classes

The number of classes. Default: 1000.

TYPE: int DEFAULT: 1000

in_channels

The input channels. Default: 3.

TYPE: int DEFAULT: 3

Inputs
  • x (Tensor) - Tensor of shape :math:(N, C_{in}, H_{in}, W_{in}).
Outputs

Tensor of shape :math:(N, CLASSES_{out}).

Source code in mindcv\models\efficientnet.py
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
@register_model
def efficientnet_v2_m(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> EfficientNet:
    """
    Constructs an EfficientNetV2 M architecture from
    `EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks <https://arxiv.org/abs/1905.11946>`_.

    Args:
        pretrained (bool): If True, returns a model pretrained on IMAGENET. Default: False.
        num_classes (int): The number of classes. Default: 1000.
        in_channels (int): The input channels. Default: 3.

    Inputs:
        - **x** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.

    Outputs:
        Tensor of shape :math:`(N, CLASSES_{out})`.
    """
    # V2 variants use fixed (1.0, 1.0) scaling; the stage config is selected by name.
    return _efficientnet("efficientnet_v2_m", 1.0, 1.0, 0.2, in_channels, num_classes, pretrained, **kwargs)

mindcv.models.efficientnet.efficientnet_v2_s(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Constructs an EfficientNetV2 S architecture from EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks <https://arxiv.org/abs/1905.11946>_.

PARAMETER DESCRIPTION
pretrained

If True, returns a model pretrained on IMAGENET. Default: False.

TYPE: bool DEFAULT: False

num_classes

The number of classes. Default: 1000.

TYPE: int DEFAULT: 1000

in_channels

The input channels. Default: 3.

TYPE: int DEFAULT: 3

Inputs
  • x (Tensor) - Tensor of shape :math:(N, C_{in}, H_{in}, W_{in}).
Outputs

Tensor of shape :math:(N, CLASSES_{out}).

Source code in mindcv\models\efficientnet.py
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
@register_model
def efficientnet_v2_s(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> EfficientNet:
    """
    Constructs an EfficientNetV2 S architecture from
    `EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks <https://arxiv.org/abs/1905.11946>`_.

    Args:
        pretrained (bool): If True, returns a model pretrained on IMAGENET. Default: False.
        num_classes (int): The number of classes. Default: 1000.
        in_channels (int): The input channels. Default: 3.

    Inputs:
        - **x** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.

    Outputs:
        Tensor of shape :math:`(N, CLASSES_{out})`.
    """
    # V2 variants use fixed (1.0, 1.0) scaling; the stage config is selected by name.
    return _efficientnet("efficientnet_v2_s", 1.0, 1.0, 0.2, in_channels, num_classes, pretrained, **kwargs)

mindcv.models.efficientnet.efficientnet_v2_xl(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Constructs an EfficientNetV2 XL architecture from EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks <https://arxiv.org/abs/1905.11946>_.

PARAMETER DESCRIPTION
pretrained

If True, returns a model pretrained on IMAGENET. Default: False.

TYPE: bool DEFAULT: False

num_classes

The number of classes. Default: 1000.

TYPE: int DEFAULT: 1000

in_channels

The input channels. Default: 3.

TYPE: int DEFAULT: 3

Inputs
  • x (Tensor) - Tensor of shape :math:(N, C_{in}, H_{in}, W_{in}).
Outputs

Tensor of shape :math:(N, CLASSES_{out}).

Source code in mindcv\models\efficientnet.py
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
@register_model
def efficientnet_v2_xl(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> EfficientNet:
    """
    Constructs an EfficientNetV2 XL architecture from
    `EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks <https://arxiv.org/abs/1905.11946>`_.

    Args:
        pretrained (bool): If True, returns a model pretrained on IMAGENET. Default: False.
        num_classes (int): The number of classes. Default: 1000.
        in_channels (int): The input channels. Default: 3.

    Inputs:
        - **x** (Tensor) - Tensor of shape :math:`(N, C_{in}, H_{in}, W_{in})`.

    Outputs:
        Tensor of shape :math:`(N, CLASSES_{out})`.
    """
    # V2 variants use fixed (1.0, 1.0) scaling; the stage config is selected by name.
    return _efficientnet("efficientnet_v2_xl", 1.0, 1.0, 0.2, in_channels, num_classes, pretrained, **kwargs)

features

mindcv.models.features

mindcv.models.features.FeatureExtractWrapper

Bases: Cell

A wrapper of the original model, which performs feature extraction at each stride. Basically, it performs 3 steps: 1. extract the return node name from the network's property feature_info; 2. partially flatten the network architecture if the network's attribute flatten_sequential is True; 3. rebuild the forward steps and output the features based on the return node name.

It also provides a property out_channels in the wrapped model, returning the number of features at each output layer. This property is usually used for downstream tasks, which require feature information at network build stage.

It should be noted that to apply this wrapper, there is a strong assumption that each of the outermost cells is registered in the same order as they are used. And there should be no reuse of each cell, even for the ReLU cell. Otherwise, the returned result may not be correct.

It should also be noted that this wrapper basically rebuilds the model. So the default checkpoint parameters cannot be loaded correctly once the model is wrapped. To use a pretrained weight, please load the weight first and then use this wrapper to rebuild the model.

PARAMETER DESCRIPTION
net

The model need to be wrapped.

TYPE: Cell

out_indices

The indices of the output features. Default: [0, 1, 2, 3, 4]

TYPE: list[int] DEFAULT: [0, 1, 2, 3, 4]

Source code in mindcv\models\features.py
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
class FeatureExtractWrapper(nn.Cell):
    """A wrapper of the original model, which performs feature extraction at each stride.
    Basically, it performs 3 steps: 1. extract the return node name from the network's property
    `feature_info`; 2. partially flatten the network architecture if the network's attribute
    `flatten_sequential` is True; 3. rebuild the forward steps and output the features based on
    the return node name.

    It also provides a property `out_channels` in the wrapped model, returning the number of features
    at each output layer. This property is usually used for downstream tasks, which require feature
    information at network build stage.

    It should be noted that to apply this wrapper, there is a strong assumption that each of the
    outermost cells is registered in the same order as they are used. And there should be no reuse of
    each cell, even for the `ReLU` cell. Otherwise, the returned result may not be correct.

    It should also be noted that this wrapper basically rebuilds the model. So the default checkpoint
    parameters cannot be loaded correctly once the model is wrapped. To use a pretrained weight, please
    load the weight first and then use this wrapper to rebuild the model.

    Args:
        net (nn.Cell): The model to be wrapped.
        out_indices (list[int]): The indices of the output features. Default: [0, 1, 2, 3, 4]
    """

    def __init__(self, net: nn.Cell, out_indices: List[int] = None) -> None:
        super().__init__(auto_prefix=False)

        # None sentinel instead of a mutable default argument; equivalent to the documented default.
        if out_indices is None:
            out_indices = [0, 1, 2, 3, 4]

        feature_info = self._get_feature_info(net)
        self.is_rewritten = getattr(net, "is_rewritten", False)
        flatten_sequential = getattr(net, "flatten_sequential", False)
        return_layers = _get_return_layers(feature_info, out_indices)
        self.return_index = list()

        if not self.is_rewritten:
            # Flatten the network into an ordered cell list and rebuild it as a SequentialCell.
            cells = _cell_list(net, flatten_sequential=flatten_sequential)
            self.net, updated_return_layers = self._create_net(cells, return_layers)

            # calculate the return index
            for i, name in enumerate(self.net.name_cells().keys()):
                if name in updated_return_layers:
                    self.return_index.append(i)
        else:
            # The network already outputs intermediate features; reuse the requested indices directly.
            self.net = net
            self.return_index = out_indices

        # calculate the out_channels from the feature_info entries of the selected layers
        self._out_channels = list()
        for i in return_layers.values():
            self._out_channels.append(feature_info[i]["chs"])

    @property
    def out_channels(self):
        """The output channels of the model, filtered by the out_indices."""
        return self._out_channels

    def construct(self, x: Tensor) -> List[Tensor]:
        return self._collect(x)

    def _get_feature_info(self, net: nn.Cell) -> Dict[str, Any]:
        # Surfaces a clear AttributeError when the wrapped network has no `feature_info`.
        try:
            feature_info = getattr(net, "feature_info")
        except AttributeError:
            raise
        return feature_info

    def _create_net(
        self, cells: Iterable[Tuple[str, str, nn.Cell]], return_layers: Dict[str, int]
    ) -> Tuple[nn.SequentialCell, Dict[str, int]]:
        """Rebuild the cells as a SequentialCell, truncating after the last requested return layer."""
        layers = OrderedDict()
        updated_return_layers = dict()
        remaining = set(return_layers.keys())
        for new_name, old_name, module in cells:
            layers[new_name] = module
            if old_name in remaining:
                updated_return_layers[new_name] = return_layers[old_name]
                remaining.remove(old_name)
            if not remaining:
                # All requested layers collected; drop the tail of the network.
                break

        net = nn.SequentialCell(layers)
        return net, updated_return_layers

    def _collect(self, x: Tensor) -> List[Tensor]:
        """Run the network and gather the outputs at the requested indices, in order."""
        out = list()

        if self.is_rewritten:
            xs = self.net(x)

            for i, x in enumerate(xs):
                if i in self.return_index:
                    out.append(x)
        else:
            for i, cell in enumerate(self.net.cell_list):
                x = cell(x)
                if i in self.return_index:
                    out.append(x)

        return out
mindcv.models.features.FeatureExtractWrapper.out_channels property

The output channels of the model, filtered by the out_indices.

ghostnet

mindcv.models.ghostnet

MindSpore implementation of GhostNet. Refer to GhostNet: More Features from Cheap Operations.

mindcv.models.ghostnet.GhostNet

Bases: Cell

GhostNet model class, based on "GhostNet: More Features from Cheap Operations " <https://arxiv.org/abs/1911.11907>_. Args: num_classes: number of classification classes. Default: 1000. width: base width of hidden channel in blocks. Default: 1.0. in_channels: number of input channels. Default: 3. drop_rate: the probability of the features before classification. Default: 0.2.

Source code in mindcv\models\ghostnet.py
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
class GhostNet(nn.Cell):
    r"""GhostNet model class, based on
    `"GhostNet: More Features from Cheap Operations " <https://arxiv.org/abs/1911.11907>`_.
    Args:
        num_classes: number of classification classes. Default: 1000.
        width: base width of hidden channel in blocks. Default: 1.0.
        in_channels: number of input channels. Default: 3.
        drop_rate: the probability of the features before classification. Default: 0.2.
    """

    def __init__(
        self,
        num_classes: int = 1000,
        width: float = 1.0,
        in_channels: int = 3,
        drop_rate: float = 0.2,
    ) -> None:
        super().__init__()
        # setting of inverted residual blocks
        self.num_classes = num_classes
        self.drop_rate = drop_rate
        # Each inner list is one stage; each entry is one GhostBottleneck spec:
        # k = kernel size, t = expansion channels, c = output channels,
        # SE = squeeze-excite ratio (0 disables SE), s = stride.
        self.cfgs = [
            # k, t, c, SE, s
            # stage1
            [[3, 16, 16, 0, 1]],
            # stage2
            [[3, 48, 24, 0, 2]],
            [[3, 72, 24, 0, 1]],
            # stage3
            [[5, 72, 40, 0.25, 2]],
            [[5, 120, 40, 0.25, 1]],
            # stage4
            [[3, 240, 80, 0, 2]],
            [[3, 200, 80, 0, 1],
             [3, 184, 80, 0, 1],
             [3, 184, 80, 0, 1],
             [3, 480, 112, 0.25, 1],
             [3, 672, 112, 0.25, 1]
             ],
            # stage5
            [[5, 672, 160, 0.25, 2]],
            [[5, 960, 160, 0, 1],
             [5, 960, 160, 0.25, 1],
             [5, 960, 160, 0, 1],
             [5, 960, 160, 0.25, 1]
             ]
        ]

        # building first layer: 3x3 stride-2 stem conv followed by BN + ReLU
        stem_chs = make_divisible(16 * width, 4)
        self.conv_stem = nn.Conv2d(in_channels, stem_chs, 3, 2, pad_mode="pad", padding=1, has_bias=False)
        self.bn1 = nn.BatchNorm2d(stem_chs)
        self.act1 = nn.ReLU()
        prev_chs = stem_chs

        # building inverted residual blocks, one SequentialCell per stage
        stages = []
        for cfg in self.cfgs:
            layers = []
            for k, exp_size, c, se_ratio, s in cfg:
                # Channel counts are scaled by `width` and rounded to a multiple of 4.
                out_chs = make_divisible(c * width, 4)
                mid_chs = make_divisible(exp_size * width, 4)
                layers.append(GhostBottleneck(prev_chs, mid_chs, out_chs, k, s, se_ratio=se_ratio))
                prev_chs = out_chs
            stages.append(nn.SequentialCell(layers))

        # NOTE(review): `exp_size` here is the leftover loop variable from the last cfg entry
        # (960), so the final ConvBnAct expands to make_divisible(960 * width, 4) channels.
        # Looks intentional but relies on loop-variable leakage — confirm.
        out_chs = make_divisible(exp_size * width, 4)
        stages.append(ConvBnAct(prev_chs, out_chs, 1))
        prev_chs = out_chs

        self.blocks = nn.SequentialCell(stages)

        # building last several layers: global pool -> 1x1 conv head -> classifier
        self.num_features = out_chs = 1280
        self.global_pool = GlobalAvgPooling(keep_dims=True)
        self.conv_head = nn.Conv2d(prev_chs, out_chs, 1, 1, pad_mode="pad", padding=0, has_bias=True)
        self.act2 = nn.ReLU()
        self.flatten = nn.Flatten()
        if self.drop_rate > 0.0:
            # Dropout is only created (and applied) when drop_rate is positive.
            self.dropout = Dropout(p=drop_rate)
        self.classifier = nn.Dense(out_chs, num_classes)
        self._initialize_weights()

    def _initialize_weights(self) -> None:
        """Initialize weights for cells."""
        # HeUniform for conv/dense weights, zeros for biases, ones/zeros for BN gamma/beta.
        for _, cell in self.cells_and_names():
            if isinstance(cell, nn.Conv2d):
                cell.weight.set_data(init.initializer(init.HeUniform(), cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(init.initializer("zeros", cell.bias.shape, cell.bias.dtype))
            elif isinstance(cell, nn.BatchNorm2d):
                cell.gamma.set_data(init.initializer("ones", cell.gamma.shape, cell.gamma.dtype))
                cell.beta.set_data(init.initializer("zeros", cell.beta.shape, cell.beta.dtype))
            elif isinstance(cell, nn.Dense):
                cell.weight.set_data(init.initializer(init.HeUniform(), cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(init.initializer("zeros", cell.bias.shape, cell.bias.dtype))

    def forward_features(self, x: Tensor) -> Tensor:
        """Compute backbone features: stem conv + BN + ReLU, then all bottleneck stages."""
        x = self.conv_stem(x)
        x = self.bn1(x)
        x = self.act1(x)
        x = self.blocks(x)
        return x

    def forward_head(self, x: Tensor) -> Tensor:
        """Classification head: pool -> conv head -> ReLU -> flatten -> (dropout) -> dense."""
        x = self.global_pool(x)
        x = self.conv_head(x)
        x = self.act2(x)
        x = self.flatten(x)
        if self.drop_rate > 0.0:
            x = self.dropout(x)
        x = self.classifier(x)
        return x

    def construct(self, x: Tensor) -> Tensor:
        x = self.forward_features(x)
        x = self.forward_head(x)
        return x

mindcv.models.ghostnet.HardSigmoid

Bases: Cell

Implementation for (relu6 + 3) / 6

Source code in mindcv\models\ghostnet.py
41
42
43
44
45
46
47
48
49
class HardSigmoid(nn.Cell):
    """Hard sigmoid activation: relu6(x + 3) / 6."""

    def __init__(self) -> None:
        super().__init__()
        self.relu6 = nn.ReLU6()

    def construct(self, x: Tensor) -> Tensor:
        shifted = self.relu6(x + 3.0)
        return shifted / 6.0

mindcv.models.ghostnet.ghostnet_050(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

GhostNet-0.5x

Source code in mindcv\models\ghostnet.py
298
299
300
301
302
303
304
305
306
307
@register_model
def ghostnet_050(pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs):
    """ GhostNet-0.5x """
    model = GhostNet(width=0.5, in_channels=in_channels, num_classes=num_classes, **kwargs)
    if pretrained:
        # Load ImageNet weights described by this model's default configuration.
        load_pretrained(model, default_cfgs["ghostnet_050"], num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.ghostnet.ghostnet_100(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

GhostNet-1.0x

Source code in mindcv\models\ghostnet.py
310
311
312
313
314
315
316
317
318
319
@register_model
def ghostnet_100(pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs):
    """ GhostNet-1.0x """
    model = GhostNet(width=1.0, in_channels=in_channels, num_classes=num_classes, **kwargs)
    if pretrained:
        # Load ImageNet weights described by this model's default configuration.
        load_pretrained(model, default_cfgs["ghostnet_100"], num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.ghostnet.ghostnet_130(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

GhostNet-1.3x

Source code in mindcv\models\ghostnet.py
322
323
324
325
326
327
328
329
330
331
@register_model
def ghostnet_130(pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs):
    """ GhostNet-1.3x """
    model = GhostNet(width=1.3, in_channels=in_channels, num_classes=num_classes, **kwargs)
    if pretrained:
        # Load ImageNet weights described by this model's default configuration.
        load_pretrained(model, default_cfgs["ghostnet_130"], num_classes=num_classes, in_channels=in_channels)
    return model

halonet

mindcv.models.halonet

MindSpore implementation of HaloNet. Refer to Scaling Local Self-Attention for Parameter Effificient Visual Backbones.

mindcv.models.halonet.ActLayer

Bases: Cell

Build Activation Layer according to act type

Source code in mindcv\models\halonet.py
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
class ActLayer(nn.Cell):
    """Build an activation layer from an activation-type string.

    Recognized values are 'silu' and 'relu'; anything else maps to Identity.
    """

    def __init__(self, act):
        super().__init__()
        if act == 'silu':
            chosen = nn.SiLU()
        elif act == 'relu':
            chosen = nn.ReLU()
        else:
            chosen = Identity()
        self.act = chosen

    def construct(self, inputs):
        return self.act(inputs)

mindcv.models.halonet.BatchNormAct2d

Bases: Cell

Build layer contain: bn-act

Source code in mindcv\models\halonet.py
87
88
89
90
91
92
93
94
95
96
97
98
class BatchNormAct2d(nn.Cell):
    """2D batch normalization followed by an activation layer (see ActLayer)."""

    def __init__(self, chs, act=None):
        super().__init__()
        self.bn = nn.BatchNorm2d(chs)
        self.act = ActLayer(act)

    def construct(self, inputs):
        normalized = self.bn(inputs)
        return self.act(normalized)

mindcv.models.halonet.BottleneckBlock

Bases: Cell

ResNet-like Bottleneck Block - 1x1 - kxk - 1x1

Source code in mindcv\models\halonet.py
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
class BottleneckBlock(nn.Cell):
    """ ResNet-like Bottleneck Block - 1x1 - kxk - 1x1

    A residual bottleneck: 1x1 reduce (to out_chs // 4) -> 3x3 (carries the stride)
    -> 1x1 expand, with an optional projection shortcut. `attn`, `attn_last`, and
    `conv2b_kxk` are Identity placeholders here (no attention in this block).

    Args:
        in_chs: number of input channels.
        out_chs: number of output channels; the hidden width is out_chs // 4.
        stride: stride applied in the kxk conv (and the shortcut conv when downsampling).
        act: activation type string forwarded to ConvBnAct / ActLayer.
        downsample: when truthy (and shortcut is set), the shortcut conv uses `stride`.
        shortcut: when truthy, a 1x1 ConvBnAct projection is used on the residual path.
    """
    def __init__(self,
                 in_chs,
                 out_chs,
                 stride,
                 act,
                 downsample=None,
                 shortcut=None,
                 ):
        super().__init__()
        self.stride = stride
        mid_chs = out_chs//4
        self.conv1_1x1 = ConvBnAct(
                                   in_chs,
                                   mid_chs,
                                   kernel_size=1,
                                   stride=1,
                                   padding=0,
                                   act=act)
        self.conv2_kxk = ConvBnAct(
                                   mid_chs,
                                   mid_chs,
                                   kernel_size=3,
                                   stride=self.stride,
                                   padding=1,
                                   act=act)
        self.conv2b_kxk = Identity()
        # Final 1x1 has no activation; the block activation is applied after the residual add.
        self.conv3_1x1 = ConvBnAct(
                                   mid_chs,
                                   out_chs,
                                   kernel_size=1,
                                   stride=1,
                                   padding=0)
        self.attn = Identity()
        self.attn_last = Identity()
        self.shortcut = shortcut
        # NOTE(review): attribute name `creat_shortcut` is misspelled ("create"); renaming it
        # would change the registered parameter names — confirm checkpoint impact before fixing.
        if self.shortcut:
            if downsample:
                self.creat_shortcut = ConvBnAct(
                                                in_chs,
                                                out_chs,
                                                kernel_size=1,
                                                stride=self.stride,
                                                padding=0)
            else:
                self.creat_shortcut = ConvBnAct(
                                                in_chs,
                                                out_chs,
                                                kernel_size=1,
                                                stride=1,
                                                padding=0)
        self.Identity = Identity()
        self.act = ActLayer(act)

    def construct(self, x):
        # Keep the input for the residual connection.
        h = x
        x = self.conv1_1x1(x)
        x = self.conv2_kxk(x)
        x = self.conv2b_kxk(x)
        x = self.attn(x)
        x = self.conv3_1x1(x)
        out = self.attn_last(x)
        if self.shortcut:
            h = self.creat_shortcut(h)
        else:
            h = self.Identity(h)
        # Residual add, then the post-activation.
        out = out + h
        out = self.act(out)
        return out

mindcv.models.halonet.ConvBnAct

Bases: Cell

Build layer contain: conv - bn - act

Source code in mindcv\models\halonet.py
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
class ConvBnAct(nn.Cell):
    """Conv2d -> BatchNorm2d -> activation (built via ActLayer)."""

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 padding,
                 act=None,
                 bias_init=False
                 ):
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            pad_mode="pad",
            padding=padding,
            weight_init=HeUniform(),
            has_bias=bias_init,
        )
        self.bn = nn.BatchNorm2d(out_channels)
        self.act = ActLayer(act)

    def construct(self, inputs):
        features = self.conv(inputs)
        features = self.bn(features)
        return self.act(features)

mindcv.models.halonet.HaloAttention

Bases: Cell

The internal dimensions of the attention module are controlled by the interaction of several arguments. the output dimension : dim_out the value(v) dimension : dim_out//num_heads the query(q) and key(k) dimensions are determined by : * num_heads*dim_head * num_heads*(dim_out*attn_ratio//num_heads) the ratio of q and k relative to the output : attn_ratio

PARAMETER DESCRIPTION
dim

input dimension to the module

TYPE: int

dim_out

output dimension of the module, same as dim if not set

TYPE: int DEFAULT: None

feat_size

size of input feature_map (not used, for arg compat with bottle/lambda)

TYPE: Tuple[int, int] DEFAULT: None

stride

output stride of the module, query downscaled if > 1 (default: 1).

DEFAULT: 1

num_heads

parallel attention heads (default: 8).

DEFAULT: 8

dim_head

dimension of query and key heads, calculated from dim_out * attn_ratio // num_heads if not set

DEFAULT: None

block_size

size of blocks. (default: 8)

TYPE: int DEFAULT: 8

halo_size

size of halo overlap. (default: 3)

TYPE: int DEFAULT: 3

qk_ratio

ratio of q and k dimensions to output dimension when dim_head not set. (default: 1.0)

TYPE: float DEFAULT: 1.0

qkv_bias

add bias to q, k, and v projections

TYPE: bool) DEFAULT: False

avg_down

use average pool downsample instead of strided query blocks

TYPE: bool DEFAULT: False

scale_pos_embed

scale the position embedding as well as Q @ K

TYPE: bool DEFAULT: False

Source code in mindcv\models\halonet.py
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
class HaloAttention(nn.Cell):
    """
    Halo self-attention module: attention is computed per non-overlapping query
    block, with keys/values taken from a window that extends ``halo_size``
    pixels beyond the block on each side.

    The internal dimensions of the attention module are controlled by
    the interaction of several arguments.
    the output dimension : dim_out
    the value(v) dimension :  dim_out//num_heads
    the query(q) and key(k) dimensions are determined by :
         * num_heads*dim_head
         * num_heads*(dim_out*qk_ratio//num_heads)
    the ratio of q and k relative to the output : qk_ratio

    Args:
        dim (int): input dimension to the module
        dim_out (int): output dimension of the module, same as dim if not set
        feat_size (Tuple[int, int]): size of input feature_map (not used, for arg compat with bottle/lambda)
        stride: output stride of the module, query downscaled if > 1 (default: 1).
        num_heads: parallel attention heads (default: 8).
        dim_head: dimension of query and key heads. NOTE(review): accepted for
            API compatibility but never read; the q/k head dim is always derived
            from qk_ratio below — confirm whether it should take precedence.
        block_size (int): size of blocks. (default: 8)
        halo_size (int): size of halo overlap. (default: 3)
        qk_ratio (float): ratio of q and k dimensions to output dimension when dim_head not set. (default: 1.0)
        qkv_bias (bool): add bias to q, k, and v projections
        avg_down (bool): use average pool downsample instead of strided query blocks
        scale_pos_embed (bool): scale the position embedding as well as Q @ K
    """
    def __init__(self,
                 dim,
                 dim_out=None,
                 feat_size=None,
                 stride=1,
                 num_heads=8,
                 dim_head=None,  # NOTE(review): unused in this implementation, see class docstring
                 block_size=8,
                 halo_size=3,
                 qk_ratio=1.0,  # ratio of q and k dimensions to output dimension when dim_head not set.
                 qkv_bias=False,
                 avg_down=False,  # use average pool downsample instead of strided query blocks
                 scale_pos_embed=False):  # scale the position embedding as well as Q @ K
        super().__init__()
        dim_out = dim_out or dim
        assert dim_out % num_heads == 0
        self.stride = stride
        self.num_heads = num_heads
        # total q/k channels are rounded to a multiple of 8, then split across heads
        self.dim_head_qk = make_divisible(dim_out * qk_ratio, divisor=8) // num_heads
        self.dim_head_v = dim_out // self.num_heads  # per-head value dimension
        self.dim_out_qk = num_heads * self.dim_head_qk  # total q/k channels
        self.dim_out_v = num_heads * self.dim_head_v  # total value (= output) channels
        self.scale = self.dim_head_qk ** -0.5  # 1/sqrt(d_k) attention scaling
        self.scale_pos_embed = scale_pos_embed
        self.block_size = self.block_size_ds = block_size
        self.halo_size = halo_size
        self.win_size = block_size + halo_size * 2  # neighbourhood window size
        self.block_stride = stride
        use_avg_pool = False
        if stride > 1:
            # NOTE(review): when use_avg_pool is True, both the strided q-conv
            # (block_stride > 1) and the trailing AvgPool2d downsample are applied,
            # which would downsample twice — verify against the reference
            # implementation if the avg_down path is ever exercised.
            use_avg_pool = avg_down or block_size % stride != 0
            self.block_stride = stride
            self.block_size_ds = self.block_size // self.block_stride  # query block size after downscaling
        self.q = nn.Conv2d(dim,
                           self.dim_out_qk,
                           1,
                           stride=self.block_stride,
                           has_bias=qkv_bias,
                           weight_init=HeUniform())
        self.kv = nn.Conv2d(dim, self.dim_out_qk + self.dim_out_v, 1, has_bias=qkv_bias)
        self.pos_embed = RelPosEmb(
            block_size=self.block_size_ds, win_size=self.win_size, dim_head=self.dim_head_qk)
        self.pool = nn.AvgPool2d(2, 2) if use_avg_pool else Identity()
        self.softmax_fn = ops.Softmax(-1)
        # pad H and W by halo_size on both sides so every block has a full halo window
        self.pad_kv = ops.Pad(
            paddings=((0, 0), (0, 0), (self.halo_size, self.halo_size), (self.halo_size, self.halo_size))
            )
        # extract win_size x win_size patches, one per block (stride == block_size)
        self.kv_unfold = nn.Unfold(
            ksizes=[1, self.win_size, self.win_size, 1],
            strides=[1, self.block_size, self.block_size, 1],
            rates=[1, 1, 1, 1],
            padding='valid'
            )

    def construct(self, x):
        B, C, H, W = x.shape
        assert H % self.block_size == 0 and W % self.block_size == 0, 'fmap dimensions must be divisible'
        num_h_blocks = H//self.block_size
        num_w_blocks = W//self.block_size
        num_blocks = num_h_blocks * num_w_blocks
        q = self.q(x)
        # unfold: rearrange query map into per-block, per-head sequences
        q = ops.reshape(q, (-1, self.dim_head_qk, num_h_blocks, self.block_size_ds, num_w_blocks, self.block_size_ds))
        q = ops.transpose(q, (0, 1, 3, 5, 2, 4))
        q = ops.reshape(q, (B*self.num_heads, self.dim_head_qk, -1, num_blocks))
        q = ops.transpose(q, (0, 3, 2, 1))  # B*num_heads,num_blocks,block_size**2, dim_head
        kv = self.kv(x)  # [bs,dim_out,H,W]
        kv = self.pad_kv(kv)  # zero-pad spatially so halo windows stay in-bounds
        kv = self.kv_unfold(kv)  # B, C_kh_kw, _, _
        kv = ops.reshape(kv, (B * self.num_heads, self.dim_head_qk + self.dim_head_v, -1, num_blocks))
        kv = ops.transpose(kv, (0, 3, 2, 1))  # [B * self.num_heads, num_blocks, -1, self.dim_head_qk + self.dim_head_v]
        # split the fused kv projection back into keys and values along the last axis
        k = kv[..., :self.dim_head_qk]
        v = kv[..., self.dim_head_qk:(self.dim_head_qk + self.dim_head_v)]
        k = ops.transpose(k, (0, 1, 3, 2))  # [B * self.num_heads, num_blocks, self.dim_head_qk, -1]
        if self.scale_pos_embed:
            # scale both content logits and relative-position logits
            attn = (ops.matmul(q, k) + self.pos_embed(q)) * self.scale
        else:
            # scale only the content logits; add position logits unscaled
            pos_embed_q = self.pos_embed(q)
            part_1 = (ops.matmul(q, k)) * self.scale
            attn = part_1 + pos_embed_q
        # attn: B * num_heads, num_blocks, block_size ** 2, win_size ** 2
        attn = self.softmax_fn(attn)
        # attn = attn @ v
        attn = ops.matmul(attn, v)  # attn: B * num_heads, num_blocks, block_size ** 2, dim_head_v
        out = ops.transpose(attn, (0, 3, 2, 1))  # B * num_heads, dim_head_v, block_size ** 2, num_blocks
        # fold: stitch per-block outputs back into a feature map
        out = ops.reshape(out, (-1, self.block_size_ds, self.block_size_ds, num_h_blocks, num_w_blocks))
        # -1, num_h_blocks, self.block_size_ds, num_w_blocks, self.block_size_ds
        out = ops.transpose(out, (0, 3, 1, 4, 2))
        out = ops.reshape(out, (B, self.dim_out_v, H // self.block_stride, W // self.block_stride))
        # B, dim_out, H // block_stride, W // block_stride
        out = self.pool(out)
        return out

mindcv.models.halonet.HaloNet

Bases: Cell

Define main structure of HaloNet: stem - blocks - head

Source code in mindcv\models\halonet.py
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
class HaloNet(nn.Cell):
    """HaloNet backbone: a convolutional stem, four HaloStage groups, and a
    pooling + linear classification head.
    """

    def __init__(self,
                 depth_list,
                 block_size,
                 halo_size,
                 stage1_block,
                 stage2_block,
                 stage3_block,
                 stage4_block,
                 chs_list,
                 num_heads,
                 num_classes,
                 stride_list,
                 hidden_chs,
                 act,
                 ):
        super().__init__()
        self.stem = Stem(act)
        # kwargs shared by all four stages
        shared = dict(block_size=block_size, halo_size=halo_size,
                      hidden_chs=hidden_chs, act=act)
        self.stage1 = HaloStage(block_types=stage1_block,
                                depth=depth_list[0],
                                channel=chs_list[0],
                                out_channel=chs_list[1],
                                stride=stride_list[0],
                                num_head=num_heads[0],
                                **shared)
        # stages 2-4 additionally downsample their input
        self.stage2 = HaloStage(block_types=stage2_block,
                                depth=depth_list[1],
                                channel=chs_list[1],
                                out_channel=chs_list[2],
                                stride=stride_list[1],
                                num_head=num_heads[1],
                                downsample=True,
                                **shared)
        self.stage3 = HaloStage(block_types=stage3_block,
                                depth=depth_list[2],
                                channel=chs_list[2],
                                out_channel=chs_list[3],
                                stride=stride_list[2],
                                num_head=num_heads[2],
                                downsample=True,
                                **shared)
        self.stage4 = HaloStage(block_types=stage4_block,
                                depth=depth_list[3],
                                channel=chs_list[3],
                                out_channel=chs_list[4],
                                stride=stride_list[3],
                                num_head=num_heads[3],
                                downsample=True,
                                **shared)
        # head: global average pool -> flatten -> linear classifier
        self.classifier = nn.SequentialCell([
            SelectAdaptivePool2d(flatten=True),
            nn.Dense(chs_list[4], num_classes, TruncatedNormal(.02), bias_init='zeros'),
            Identity()]
        )
        self._initialize_weights()

    def _initialize_weights(self) -> None:
        """Initialize weights for cells."""
        for _, cell in self.cells_and_names():
            # Conv2d and Dense share the same He-uniform weight / zero bias scheme
            if isinstance(cell, (nn.Conv2d, nn.Dense)):
                cell.weight.set_data(init.initializer(init.HeUniform(), cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(init.initializer("zeros", cell.bias.shape, cell.bias.dtype))
            elif isinstance(cell, nn.BatchNorm2d):
                cell.gamma.set_data(init.initializer("ones", cell.gamma.shape, cell.gamma.dtype))
                cell.beta.set_data(init.initializer("zeros", cell.beta.shape, cell.beta.dtype))

    def construct(self, x):
        feat = self.stem(x)
        feat = self.stage1(feat)
        feat = self.stage2(feat)
        feat = self.stage3(feat)
        feat = self.stage4(feat)
        return self.classifier(feat)

mindcv.models.halonet.HaloStage

Bases: Cell

Stage layers for HaloNet. Stage layers contains a number of Blocks.

Source code in mindcv\models\halonet.py
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
class HaloStage(nn.Cell):
    """One HaloNet stage: a sequence of bottleneck and/or halo-attention blocks.

    Only the first block of a stage may downsample, use a shortcut projection,
    or receive ``hidden_chs``; all later blocks run at stride 1.
    """

    def __init__(self,
                 block_types,
                 block_size,
                 halo_size,
                 depth,
                 channel,
                 out_channel,
                 stride,
                 num_head,
                 act,
                 hidden_chs=None,
                 downsample=None,
                 ):
        super().__init__()
        self.depth = depth
        layers = []
        for idx in range(depth):
            first = idx == 0
            shortcut = first
            in_channel = channel if first else out_channel
            # instance attrs mirror the per-iteration flags of the original code
            self.down = (False if downsample is None else downsample) if first else False
            self.hidden = hidden_chs if first else None
            cur_stride = stride if first else 1

            kind = block_types[idx]
            if kind == 'bottle':
                layers.append(
                    BottleneckBlock(
                        in_chs=in_channel,
                        out_chs=out_channel,
                        stride=cur_stride,
                        shortcut=shortcut,
                        downsample=self.down,
                        act=act,
                    )
                )
            if kind == 'attn' and num_head > 0:
                layers.append(
                    SelfAttnBlock(
                        chs=out_channel,
                        stride=cur_stride,
                        num_heads=num_head,
                        block_size=block_size,
                        halo_size=halo_size,
                        hidden_chs=self.hidden,
                        shortcut=shortcut,
                        act=act,
                    )
                )
        self.blocks = nn.CellList(layers)

    def construct(self, x):
        for blk in self.blocks:
            x = blk(x)
        return x

mindcv.models.halonet.RelPosEmb

Bases: Cell

Relative Position Embedding

Source code in mindcv\models\halonet.py
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
class RelPosEmb(nn.Cell):
    """Relative position embedding for halo attention.

    Holds two learnable tables (height and width) of shape
    ``(2 * win_size - 1, dim_head)`` and produces relative-position logits
    for a batch of query blocks.
    """

    def __init__(
            self,
            block_size,
            win_size,
            dim_head,
            ):
        """
        Args:
            block_size (int): block size.
            win_size (int): neighbourhood window size.
            dim_head (int): attention head dimension.
        """
        super().__init__()
        self.block_size = block_size
        self.dim_head = dim_head
        table_shape = ((2 * win_size - 1), dim_head)
        self.rel_height = Parameter(
            Tensor(shape=table_shape, dtype=ms.float32, init=TruncatedNormal(sigma=.02)))
        self.rel_width = Parameter(
            Tensor(shape=table_shape, dtype=ms.float32, init=TruncatedNormal(sigma=.02)))

    def construct(self, q):
        batch, num_blocks, hw, _ = q.shape
        # view queries as 2-D blocks: (-1, block_size, block_size, dim_head)
        q = ops.reshape(q, (-1, self.block_size, self.block_size, self.dim_head))
        logits_w = rel_logits_1d(q, self.rel_width, permute_mask=(0, 1, 3, 2, 4))
        # swap spatial axes and repeat along the height dimension
        q = ops.transpose(q, (0, 2, 1, 3))
        logits_h = rel_logits_1d(q, self.rel_height, permute_mask=(0, 3, 1, 4, 2))
        return ops.reshape(logits_h + logits_w, (batch, num_blocks, hw, -1))
mindcv.models.halonet.RelPosEmb.__init__(block_size, win_size, dim_head)

Parameters: block_size (int) — block size; win_size (int) — neighbourhood window size; dim_head (int) — attention head dim; scale (float) — scale factor (for init).

Source code in mindcv\models\halonet.py
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
def __init__(
        self,
        block_size,
        win_size,
        dim_head,
        ):
    """Build the relative-position embedding tables.

    :param block_size (int): block size
    :param win_size (int): neighbourhood window size
    :param dim_head (int): attention head dim
    """
    super().__init__()
    self.block_size = block_size
    self.dim_head = dim_head
    # learnable (2*win_size-1, dim_head) tables for height and width offsets
    tensor1 = Tensor(shape=((2 * win_size - 1), dim_head), dtype=ms.float32, init=TruncatedNormal(sigma=.02))
    self.rel_height = Parameter(tensor1)
    tensor2 = Tensor(shape=((2 * win_size - 1), dim_head), dtype=ms.float32, init=TruncatedNormal(sigma=.02))
    self.rel_width = Parameter(tensor2)

mindcv.models.halonet.SelectAdaptivePool2d

Bases: Cell

Selectable global pooling layer with dynamic input kernel size

Source code in mindcv\models\halonet.py
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
class SelectAdaptivePool2d(nn.Cell):
    """Selectable global pooling layer with dynamic input kernel size.

    Args:
        pool_type (str): 'avg' for global average pooling over the spatial
            dimensions, or '' (or any falsy value) to disable pooling.
            Default: 'avg'.
        flatten (bool): whether to flatten the pooled output. Default: False.

    Raises:
        ValueError: if ``pool_type`` is not 'avg' or empty.
    """
    def __init__(self, pool_type='avg', flatten=False):
        super().__init__()
        # convert other false values to empty string for consistent typing
        self.pool_type = pool_type or ''
        self.flatten = nn.Flatten() if flatten else Identity()
        if self.pool_type == '':
            self.pool = Identity()
        elif self.pool_type == 'avg':
            self.pool = ops.ReduceMean(keep_dims=True)
        else:
            # raise (not assert): assert would be stripped under `python -O`
            raise ValueError(f"Invalid pool type: {pool_type}")

    def construct(self, inputs):
        if self.pool_type == 'avg':
            # reduce over the spatial axes (H, W)
            out = self.pool(inputs, (2, 3))
        else:
            # Identity takes a single argument; passing the axes tuple would fail
            out = self.pool(inputs)
        out = self.flatten(out)
        return out

mindcv.models.halonet.SelfAttnBlock

Bases: Cell

ResNet-like Bottleneck Block - 1x1 -kxk - self attn -1x1

Source code in mindcv\models\halonet.py
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
class SelfAttnBlock(nn.Cell):
    """ResNet-like bottleneck block with halo self-attention:
    1x1 conv -> halo attention -> BN+act -> 1x1 conv, plus a residual path.
    """

    def __init__(self,
                 chs,
                 num_heads,
                 block_size,
                 halo_size,
                 act,
                 stride=None,
                 shortcut=None,
                 hidden_chs=None,
                 ):
        super().__init__()
        mid_chs = chs // 4  # bottleneck width
        out_chs = chs if hidden_chs is None else hidden_chs
        self.stride = 1 if stride is None else stride
        self.conv1_1x1 = ConvBnAct(out_chs, mid_chs, kernel_size=1, stride=1, padding=0, act=act)
        self.conv2_kxk = Identity()  # placeholder; not used in construct
        self.conv3_1x1 = ConvBnAct(mid_chs, chs, kernel_size=1, stride=1, padding=0)
        self.self_attn = HaloAttention(mid_chs,
                                       dim_out=mid_chs,
                                       block_size=block_size,
                                       halo_size=halo_size,
                                       num_heads=num_heads,
                                       stride=self.stride)
        self.post_attn = BatchNormAct2d(mid_chs, act=act)
        self.shortcut = shortcut
        if self.shortcut:
            # 1x1 projection so the residual matches shape when shortcut is enabled
            self.creat_shortcut = ConvBnAct(out_chs,
                                            chs,
                                            kernel_size=1,
                                            stride=self.stride,
                                            padding=0)
        self.Identity = Identity()
        self.act = ActLayer(act=act)

    def construct(self, x):
        out = self.conv1_1x1(x)
        out = self.post_attn(self.self_attn(out))
        out = self.conv3_1x1(out)
        residual = self.creat_shortcut(x) if self.shortcut else self.Identity(x)
        return self.act(out + residual)

mindcv.models.halonet.halonet_50t(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get HaloNet model. Refer to the base class models.HaloNet for more details.

Source code in mindcv\models\halonet.py
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
@register_model
def halonet_50t(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs):
    """Get HaloNet model.
    Refer to the base class `models.HaloNet` for more details."""
    # architecture configuration for the 50t variant
    model_args = dict(
        depth_list=[3, 4, 6, 3],
        stage1_block=['bottle', 'bottle', 'bottle'],
        stage2_block=['bottle', 'bottle', 'bottle', 'attn'],
        stage3_block=['bottle', 'attn', 'bottle', 'attn', 'bottle', 'attn'],
        stage4_block=['bottle', 'attn', 'bottle'],
        chs_list=[64, 256, 512, 1024, 2048],
        num_heads=[0, 4, 8, 8],
        num_classes=num_classes,
        stride_list=[1, 2, 2, 2],
        block_size=8,
        halo_size=3,
        hidden_chs=None,
        act='silu',
    )
    model = HaloNet(**model_args, **kwargs)
    if pretrained:
        load_pretrained(model, default_cfgs["halonet_50t"],
                        num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.halonet.rel_logits_1d(q, rel_k, permute_mask)

Compute relative logits along one dimension :param q: [batch,H,W,dim] :param rel_k: [2*window-1,dim] :param permute_mask: permute output axis according to this

Source code in mindcv\models\halonet.py
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
def rel_logits_1d(q, rel_k, permute_mask):
    """ Compute relative logits along one dimension.

    Uses the standard pad-and-reshape ("skew") trick to convert relative
    position indexing into absolute indexing without an explicit gather.

    :param q: [batch,H,W,dim]
    :param rel_k: [2*window-1,dim]
    :param permute_mask: permute output axis according to this
    """
    B, H, W, _ = q.shape
    rel_size = rel_k.shape[0]
    win_size = (rel_size+1)//2
    rel_k = ops.transpose(rel_k, (1, 0))
    # project queries onto the relative embedding: (..., W, rel_size)
    x = msnp.tensordot(q, rel_k, axes=1)
    x = ops.reshape(x, (-1, W, rel_size))
    # pad to shift from relative to absolute indexing
    x_pad = ops.pad(x, paddings=((0, 0), (0, 0), (0, 1)))
    x_pad = ops.flatten(x_pad)
    x_pad = ops.expand_dims(x_pad, 1)
    x_pad = ops.pad(x_pad, paddings=((0, 0), (0, 0), (0, rel_size - W)))
    # axis=() squeezes all size-1 dims, removing the expand_dims axis above
    x_pad = ops.squeeze(x_pad, axis=())
    # reshape and slice out the padded elements
    x_pad = ops.reshape(x_pad, (-1, W+1, rel_size))
    x = x_pad[:, :W, win_size-1:]
    # reshape and tile: broadcast the per-row logits across win_size rows
    x = ops.reshape(x, (B, H, 1, W, win_size))
    x = ops.broadcast_to(x, (B, H, win_size, W, win_size))
    x = ops.transpose(x, permute_mask)
    return x

hrnet

mindcv.models.hrnet

MindSpore implementation of HRNet. Refer to Deep High-Resolution Representation Learning for Visual Recognition

mindcv.models.hrnet.BasicBlock

Bases: Cell

Basic block of HRNet

Source code in mindcv\models\hrnet.py
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
class BasicBlock(nn.Cell):
    """Basic residual block of HRNet: two 3x3 convolutions with a skip connection.

    Args:
        in_channels: number of input channels.
        channels: number of output channels.
        stride: stride of the first 3x3 convolution. Default: 1.
        groups: must be 1 for BasicBlock.
        base_width: must be 64 for BasicBlock.
        norm: normalization cell class; defaults to nn.BatchNorm2d.
        down_sample: optional cell applied to the identity path to match shapes.
    """

    expansion: int = 1

    def __init__(
        self,
        in_channels: int,
        channels: int,
        stride: int = 1,
        groups: int = 1,
        base_width: int = 64,
        norm: Optional[nn.Cell] = None,
        down_sample: Optional[nn.Cell] = None,
    ) -> None:
        super().__init__()
        norm = norm if norm is not None else nn.BatchNorm2d
        assert groups == 1, "BasicBlock only supports groups=1"
        assert base_width == 64, "BasicBlock only supports base_width=64"

        self.conv1 = nn.Conv2d(in_channels, channels, kernel_size=3,
                               stride=stride, padding=1, pad_mode="pad")
        self.bn1 = norm(channels)
        self.relu = nn.ReLU()
        self.conv2 = nn.Conv2d(channels, channels, kernel_size=3,
                               stride=1, padding=1, pad_mode="pad")
        self.bn2 = norm(channels)
        self.down_sample = down_sample

    def construct(self, x: Tensor) -> Tensor:
        identity = x if self.down_sample is None else self.down_sample(x)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        return self.relu(out + identity)

mindcv.models.hrnet.Bottleneck

Bases: Cell

Bottleneck block of HRNet

Source code in mindcv\models\hrnet.py
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
class Bottleneck(nn.Cell):
    """Bottleneck residual block of HRNet: 1x1 -> 3x3 -> 1x1 convolutions
    with a skip connection; output channels are ``channels * expansion``.

    Args:
        in_channels: number of input channels.
        channels: base number of channels (expanded by ``expansion`` at output).
        stride: stride of the 3x3 convolution. Default: 1.
        groups: number of groups for the 3x3 convolution.
        base_width: base width used to scale the bottleneck width.
        norm: normalization cell class; defaults to nn.BatchNorm2d.
        down_sample: optional cell applied to the identity path to match shapes.
    """

    expansion: int = 4

    def __init__(
        self,
        in_channels: int,
        channels: int,
        stride: int = 1,
        groups: int = 1,
        base_width: int = 64,
        norm: Optional[nn.Cell] = None,
        down_sample: Optional[nn.Cell] = None,
    ) -> None:
        super().__init__()
        norm = norm if norm is not None else nn.BatchNorm2d

        # bottleneck width scales with base_width and groups
        width = int(channels * (base_width / 64.0)) * groups

        self.conv1 = nn.Conv2d(in_channels, width, kernel_size=1, stride=1)
        self.bn1 = norm(width)
        self.conv2 = nn.Conv2d(width, width, kernel_size=3, stride=stride,
                               padding=1, pad_mode="pad", group=groups)
        self.bn2 = norm(width)
        self.conv3 = nn.Conv2d(width, channels * self.expansion,
                               kernel_size=1, stride=1)
        self.bn3 = norm(channels * self.expansion)
        self.relu = nn.ReLU()
        self.down_sample = down_sample

    def construct(self, x: Tensor) -> Tensor:
        identity = x if self.down_sample is None else self.down_sample(x)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        return self.relu(out + identity)

mindcv.models.hrnet.HRModule

Bases: Cell

High-Resolution Module for HRNet. In this module, every branch has 4 BasicBlocks/Bottlenecks. Fusion/Exchange is in this module.

Source code in mindcv\models\hrnet.py
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
class HRModule(nn.Cell):
    """High-Resolution Module for HRNet.
    In this module, every branch has 4 BasicBlocks/Bottlenecks. Fusion/Exchange
    is in this module.
    """

    def __init__(
        self,
        num_branches: int,
        block: Type[Union[BasicBlock, Bottleneck]],
        num_blocks: List[int],
        num_inchannels: List[int],
        num_channels: List[int],
        multi_scale_output: bool = True,
    ) -> None:
        """Build the parallel branches and the cross-resolution fuse layers.

        Args:
            num_branches: number of parallel resolution branches.
            num_blocks: blocks per branch; one entry per branch.
            num_inchannels: input channels per branch; one entry per branch.
                NOTE: this list is mutated in-place by ``_make_one_branch``
                to record each branch's output width.
            num_channels: base output channels per branch (scaled by
                ``block.expansion``); one entry per branch.
            multi_scale_output: if True, produce one fused output per branch;
                otherwise only the highest-resolution output. Default: True.
        """
        super().__init__()
        # Fail fast on mismatched per-branch configuration lists.
        self._check_branches(num_branches, num_blocks, num_inchannels, num_channels)

        self.num_inchannels = num_inchannels
        self.num_branches = num_branches

        self.multi_scale_output = multi_scale_output

        self.branches = self._make_branches(
            num_branches, block, num_blocks, num_channels
        )
        self.fuse_layers = self._make_fuse_layers()
        self.relu = nn.ReLU()

    @staticmethod
    def _check_branches(
        num_branches: int,
        num_blocks: List[int],
        num_inchannels: List[int],
        num_channels: List[int],
    ) -> None:
        """Check input to avoid ValueError."""
        if num_branches != len(num_blocks):
            error_msg = f"NUM_BRANCHES({num_branches})!= NUM_BLOCKS({len(num_blocks)})"
            raise ValueError(error_msg)

        if num_branches != len(num_channels):
            error_msg = (
                f"NUM_BRANCHES({num_branches})!= NUM_CHANNELS({len(num_channels)})"
            )
            raise ValueError(error_msg)

        if num_branches != len(num_inchannels):
            error_msg = (
                f"NUM_BRANCHES({num_branches}) != NUM_INCHANNELS({len(num_inchannels)})"
            )
            raise ValueError(error_msg)

    def _make_one_branch(
        self,
        branch_index: int,
        block: Type[Union[BasicBlock, Bottleneck]],
        num_blocks: List[int],
        num_channels: List[int],
        stride: int = 1,
    ) -> nn.SequentialCell:
        """Build one branch: ``num_blocks[branch_index]`` chained residual blocks."""
        # A projection shortcut is needed when the first block changes the
        # spatial stride or the channel count.
        downsample = None
        if stride != 1 or self.num_inchannels[branch_index] != num_channels[branch_index] * block.expansion:
            downsample = nn.SequentialCell(
                nn.Conv2d(
                    self.num_inchannels[branch_index],
                    num_channels[branch_index] * block.expansion,
                    kernel_size=1,
                    stride=stride,
                ),
                nn.BatchNorm2d(num_channels[branch_index] * block.expansion),
            )

        layers = []
        layers.append(
            block(
                self.num_inchannels[branch_index],
                num_channels[branch_index],
                stride,
                down_sample=downsample,
            )
        )
        # Record the branch's output width so the fuse layers (and any later
        # modules sharing this list) see the post-expansion channel count.
        self.num_inchannels[branch_index] = num_channels[branch_index] * block.expansion
        for _ in range(1, num_blocks[branch_index]):
            layers.append(
                block(self.num_inchannels[branch_index], num_channels[branch_index])
            )

        return nn.SequentialCell(layers)

    def _make_branches(
        self,
        num_branches: int,
        block: Type[Union[BasicBlock, Bottleneck]],
        num_blocks: List[int],
        num_channels: List[int],
    ) -> nn.CellList:
        """Make branches."""
        branches = []

        for i in range(num_branches):
            branches.append(self._make_one_branch(i, block, num_blocks, num_channels))

        return nn.CellList(branches)

    def _make_fuse_layers(self) -> nn.CellList:
        """Build the cross-resolution exchange layers.

        Returns None when there is a single branch (nothing to fuse).
        For output row ``i`` and input branch ``j``:
            j > i: 1x1 conv + BN to match channels (the spatial upsampling
                   itself happens at runtime in ``construct``);
            j == i: identity;
            j < i: ``i - j`` stride-2 3x3 conv + BN stages to downsample,
                   with ReLU on every stage except the last.
        """
        if self.num_branches == 1:
            return None

        num_branches = self.num_branches
        num_inchannels = self.num_inchannels
        fuse_layers = []
        # Without multi-scale output only the highest-resolution row is built.
        for i in range(num_branches if self.multi_scale_output else 1):
            fuse_layer = []
            for j in range(num_branches):
                if j > i:
                    fuse_layer.append(
                        nn.SequentialCell(
                            nn.Conv2d(
                                num_inchannels[j], num_inchannels[i], kernel_size=1
                            ),
                            nn.BatchNorm2d(num_inchannels[i]),
                        )
                    )
                elif j == i:
                    fuse_layer.append(IdentityCell())
                else:
                    conv3x3s = []
                    for k in range(i - j):
                        if k == i - j - 1:
                            # Last downsampling stage: switch to the target
                            # channel count and omit the ReLU (activation is
                            # applied after the fuse-sum in `construct`).
                            num_outchannels_conv3x3 = num_inchannels[i]
                            conv3x3s.append(
                                nn.SequentialCell(
                                    nn.Conv2d(
                                        num_inchannels[j],
                                        num_outchannels_conv3x3,
                                        kernel_size=3,
                                        stride=2,
                                        padding=1,
                                        pad_mode="pad",
                                    ),
                                    nn.BatchNorm2d(num_outchannels_conv3x3),
                                )
                            )
                        else:
                            num_outchannels_conv3x3 = num_inchannels[j]
                            conv3x3s.append(
                                nn.SequentialCell(
                                    nn.Conv2d(
                                        num_inchannels[j],
                                        num_outchannels_conv3x3,
                                        kernel_size=3,
                                        stride=2,
                                        padding=1,
                                        pad_mode="pad",
                                    ),
                                    nn.BatchNorm2d(num_outchannels_conv3x3),
                                    nn.ReLU(),
                                )
                            )
                    fuse_layer.append(nn.SequentialCell(conv3x3s))
            fuse_layers.append(nn.CellList(fuse_layer))

        return nn.CellList(fuse_layers)

    def construct(self, x: List[Tensor]) -> List[Tensor]:
        # Single branch: run it directly, no fusion needed.
        if self.num_branches == 1:
            return [self.branches[0](x[0])]

        # Run each branch on its own resolution.
        x2 = []
        for i in range(self.num_branches):
            x2.append(self.branches[i](x[i]))

        x_fuse = []

        for i in range(len(self.fuse_layers)):
            # Row 0's first term is the raw branch-0 output (identity fuse).
            y = x2[0] if i == 0 else self.fuse_layers[i][0](x2[0])
            for j in range(1, self.num_branches):
                if i == j:
                    y = y + x2[j]
                elif j > i:
                    # Lower-resolution branch j: channel-match via the 1x1
                    # fuse conv, then nearest-neighbor upsample to branch i's
                    # spatial size.
                    _, _, height, width = x2[i].shape
                    t = self.fuse_layers[i][j](x2[j])
                    # Resize is computed in fp32, then cast to fp16.
                    # NOTE(review): this assumes fp16 (mixed-precision)
                    # activations throughout — confirm before running fp32.
                    t = ops.cast(t, ms.float32)
                    t = ops.ResizeNearestNeighbor((height, width))(t)
                    t = ops.cast(t, ms.float16)
                    y = y + t
                else:
                    # Higher-resolution branch j: strided-conv chain already
                    # downsamples it to branch i's resolution.
                    y = y + self.fuse_layers[i][j](x2[j])
            x_fuse.append(self.relu(y))

        # With multi_scale_output disabled only one row was built; return it
        # as a single tensor instead of a one-element list.
        if not self.multi_scale_output:
            x_fuse = x_fuse[0]

        return x_fuse

mindcv.models.hrnet.HRNet

Bases: Cell

HRNet Backbone, based on "Deep High-Resolution Representation Learning for Visual Recognition" <https://arxiv.org/abs/1908.07919>_.

PARAMETER DESCRIPTION
stage_cfg

Configuration of the extra blocks. It accepts a dictionary storing the detailed config of each block, which includes num_modules, num_branches, block, num_blocks, num_channels. For a detailed example, please check the implementation of hrnet_w32 and hrnet_w48.

TYPE: Dict[str, Dict[str, int]]

num_classes

number of classification classes. Default: 1000.

TYPE: int DEFAULT: 1000

in_channels

Number of channels of the input. Default: 3.

TYPE: int DEFAULT: 3

Source code in mindcv\models\hrnet.py
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
class HRNet(nn.Cell):
    r"""HRNet Backbone, based on
    `"Deep High-Resolution Representation Learning for Visual Recognition"
    <https://arxiv.org/abs/1908.07919>`_.

    Args:
        stage_cfg: Configuration of the extra blocks. It accepts a dictionary
            storing the detailed config of each block, which includes `num_modules`,
            `num_branches`, `block`, `num_blocks`, `num_channels`. For a detailed
            example, please check the implementation of `hrnet_w32` and `hrnet_w48`.
        num_classes: number of classification classes. Default: 1000.
        in_channels: Number of channels of the input. Default: 3.
    """

    # Maps the string block names used in stage_cfg to the block classes.
    blocks_dict = {"BASIC": BasicBlock, "BOTTLENECK": Bottleneck}

    def __init__(
        self,
        stage_cfg: Dict[str, Dict[str, int]],
        num_classes: int = 1000,
        in_channels: int = 3,
    ) -> None:
        super().__init__()

        self.stage_cfg = stage_cfg
        # stem net: two stride-2 3x3 convs, i.e. overall stride 4 before stage 1
        self.conv1 = nn.Conv2d(
            in_channels, 64, kernel_size=3, stride=2, padding=1, pad_mode="pad"
        )
        self.bn1 = nn.BatchNorm2d(64)
        self.conv2 = nn.Conv2d(
            64, 64, kernel_size=3, stride=2, padding=1, pad_mode="pad"
        )
        self.bn2 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU()

        # stage 1
        self.stage1_cfg = self.stage_cfg["stage1"]
        num_channels = self.stage1_cfg["num_channels"][0]
        num_blocks = self.stage1_cfg["num_blocks"][0]
        block = self.blocks_dict[self.stage1_cfg["block"]]
        self.layer1 = self._make_layer(block, 64, num_channels, num_blocks)

        # stage 2
        self.stage2_cfg = self.stage_cfg["stage2"]
        num_channels = self.stage2_cfg["num_channels"]
        block = self.blocks_dict[self.stage2_cfg["block"]]
        # Scale configured widths by the block's channel expansion factor.
        num_channels = [
            num_channels[i] * block.expansion for i in range(len(num_channels))
        ]

        # stage 1 output has 256 channels (64 * Bottleneck expansion).
        self.transition1, self.transition1_flags = self._make_transition_layer(
            [256], num_channels
        )
        self.stage2, pre_stage_channels = self._make_stage(
            self.stage2_cfg, num_channels
        )

        # stage 3
        self.stage3_cfg = self.stage_cfg["stage3"]
        num_channels = self.stage3_cfg["num_channels"]
        block = self.blocks_dict[self.stage3_cfg["block"]]
        num_channels = [
            num_channels[i] * block.expansion for i in range(len(num_channels))
        ]

        self.transition2, self.transition2_flags = self._make_transition_layer(
            pre_stage_channels, num_channels
        )
        self.stage3, pre_stage_channels = self._make_stage(
            self.stage3_cfg, num_channels
        )

        # stage 4
        self.stage4_cfg = self.stage_cfg["stage4"]
        num_channels = self.stage4_cfg["num_channels"]
        block = self.blocks_dict[self.stage4_cfg["block"]]
        num_channels = [
            num_channels[i] * block.expansion for i in range(len(num_channels))
        ]
        self.transition3, self.transition3_flags = self._make_transition_layer(
            pre_stage_channels, num_channels
        )
        self.stage4, pre_stage_channels = self._make_stage(
            self.stage4_cfg, num_channels
        )

        # head: merge the multi-resolution features down to a 2048-dim vector
        self.pool = GlobalAvgPooling()
        self.incre_modules, self.downsample_modules, self.final_layer = self._make_head(
            pre_stage_channels
        )
        # final_layer outputs 2048 channels, hence the fixed classifier width.
        self.classifier = nn.Dense(2048, num_classes)

    def _make_head(self, pre_stage_channels: List[int]):
        """Build the classification head.

        Returns (incre_modules, downsamp_modules, final_layer): per-resolution
        channel-increasing Bottleneck layers, top-down merging downsamplers,
        and the final 1x1 conv to 2048 channels.
        """
        head_block = Bottleneck
        head_channels = [32, 64, 128, 256]

        # increase the #channels on each resolution
        # from C, 2C, 4C, 8C to 128, 256, 512, 1024
        incre_modules = list()
        for i, channels in enumerate(pre_stage_channels):
            incre_module = self._make_layer(
                head_block, channels, head_channels[i], 1, stride=1
            )
            incre_modules.append(incre_module)
        incre_modules = nn.CellList(incre_modules)

        # downsample modules: each halves the resolution and doubles channels
        # so adjacent head features can be summed in forward_head.
        downsamp_modules = []
        for i in range(len(pre_stage_channels) - 1):
            in_channels = head_channels[i] * head_block.expansion
            out_channels = head_channels[i + 1] * head_block.expansion

            downsamp_module = nn.SequentialCell(
                nn.Conv2d(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    kernel_size=3,
                    stride=2,
                    pad_mode="pad",
                    padding=1,
                ),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(),
            )

            downsamp_modules.append(downsamp_module)
        downsamp_modules = nn.CellList(downsamp_modules)

        final_layer = nn.SequentialCell(
            nn.Conv2d(
                in_channels=head_channels[3] * head_block.expansion,
                out_channels=2048,
                kernel_size=1,
                stride=1,
                padding=0,
            ),
            nn.BatchNorm2d(2048),
            nn.ReLU(),
        )

        return incre_modules, downsamp_modules, final_layer

    def _make_transition_layer(
        self, num_channels_pre_layer: List[int], num_channels_cur_layer: List[int]
    ) -> Tuple[nn.CellList, List[bool]]:
        """Build transition layers between consecutive stages.

        Returns the layers plus a parallel list of flags; a flag is True when
        the corresponding layer must actually be applied (False marks an
        identity placeholder, which the forward pass skips by flag rather
        than by calling the cell).
        """
        num_branches_cur = len(num_channels_cur_layer)
        num_branches_pre = len(num_channels_pre_layer)

        transition_layers = []
        transition_layers_flags = []
        for i in range(num_branches_cur):
            if i < num_branches_pre:
                # Existing branch: only needs a conv if channel counts differ.
                if num_channels_cur_layer[i] != num_channels_pre_layer[i]:
                    transition_layers.append(
                        nn.SequentialCell(
                            nn.Conv2d(
                                num_channels_pre_layer[i],
                                num_channels_cur_layer[i],
                                kernel_size=3,
                                padding=1,
                                pad_mode="pad",
                            ),
                            nn.BatchNorm2d(num_channels_cur_layer[i]),
                            nn.ReLU(),
                        )
                    )
                    transition_layers_flags.append(True)
                else:
                    transition_layers.append(IdentityCell())
                    transition_layers_flags.append(False)
            else:
                # New, lower-resolution branch: downsample from the previous
                # stage's lowest-resolution feature with stride-2 convs.
                conv3x3s = []
                for j in range(i + 1 - num_branches_pre):
                    inchannels = num_channels_pre_layer[-1]
                    # Only the last conv in the chain changes channel count.
                    outchannels = (
                        num_channels_cur_layer[i]
                        if j == i - num_branches_pre
                        else inchannels
                    )
                    conv3x3s.append(
                        nn.SequentialCell(
                            [
                                nn.Conv2d(
                                    inchannels,
                                    outchannels,
                                    kernel_size=3,
                                    stride=2,
                                    padding=1,
                                    pad_mode="pad",
                                ),
                                nn.BatchNorm2d(outchannels),
                                nn.ReLU(),
                            ]
                        )
                    )
                transition_layers.append(nn.SequentialCell(conv3x3s))
                transition_layers_flags.append(True)

        return nn.CellList(transition_layers), transition_layers_flags

    def _make_layer(
        self,
        block: Type[Union[BasicBlock, Bottleneck]],
        in_channels: int,
        out_channels: int,
        blocks: int,
        stride: int = 1,
    ) -> nn.SequentialCell:
        """Stack `blocks` residual blocks; the first may carry a projection shortcut."""
        downsample = None
        if stride != 1 or in_channels != out_channels * block.expansion:
            downsample = nn.SequentialCell(
                nn.Conv2d(
                    in_channels,
                    out_channels * block.expansion,
                    kernel_size=1,
                    stride=stride,
                ),
                nn.BatchNorm2d(out_channels * block.expansion),
            )

        layers = []
        layers.append(block(in_channels, out_channels, stride, down_sample=downsample))
        for _ in range(1, blocks):
            layers.append(block(out_channels * block.expansion, out_channels))

        return nn.SequentialCell(layers)

    def _make_stage(
        self,
        layer_config: Dict[str, int],
        num_inchannels: List[int],
        multi_scale_output: bool = True,
    ) -> Tuple[nn.SequentialCell, List[int]]:
        """Chain `num_modules` HRModules; returns the stage and its output channels."""
        num_modules = layer_config["num_modules"]
        num_branches = layer_config["num_branches"]
        num_blocks = layer_config["num_blocks"]
        num_channels = layer_config["num_channels"]
        block = self.blocks_dict[layer_config["block"]]

        modules = []
        for i in range(num_modules):
            # multi_scale_output is only used last module
            if not multi_scale_output and i == num_modules - 1:
                reset_multi_scale_output = False
            else:
                reset_multi_scale_output = True

            modules.append(
                HRModule(
                    num_branches,
                    block,
                    num_blocks,
                    num_inchannels,
                    num_channels,
                    reset_multi_scale_output,
                )
            )
            # HRModule updates its num_inchannels in-place; propagate it.
            num_inchannels = modules[-1].num_inchannels

        return nn.SequentialCell(modules), num_inchannels

    def forward_features(self, x: Tensor) -> List[Tensor]:
        """Perform the feature extraction.

        Args:
            x: Tensor

        Returns:
            Extracted feature
        """
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu(x)

        # stage 1
        x = self.layer1(x)

        # stage 2
        x_list = []
        for i in range(self.stage2_cfg["num_branches"]):
            if self.transition1_flags[i]:
                x_list.append(self.transition1[i](x))
            else:
                x_list.append(x)
        y_list = self.stage2(x_list)

        # stage 3
        # NOTE(review): flagged transitions always consume y_list[-1]; this is
        # correct for new (downsampled) branches and relies on pre-existing
        # branches never being flagged (their channel counts match with the
        # shipped configs) — confirm if new stage configs are added.
        x_list = []
        for i in range(self.stage3_cfg["num_branches"]):
            if self.transition2_flags[i]:
                x_list.append(self.transition2[i](y_list[-1]))
            else:
                x_list.append(y_list[i])
        y_list = self.stage3(x_list)

        # stage 4
        x_list = []
        for i in range(self.stage4_cfg["num_branches"]):
            if self.transition3_flags[i]:
                x_list.append(self.transition3[i](y_list[-1]))
            else:
                x_list.append(y_list[i])
        y = self.stage4(x_list)

        return y

    def forward_head(self, x: List[Tensor]) -> Tensor:
        """Merge the feature pyramid top-down and classify."""
        y = self.incre_modules[0](x[0])
        for i in range(len(self.downsample_modules)):
            y = self.incre_modules[i + 1](x[i + 1]) + self.downsample_modules[i](y)

        y = self.final_layer(y)
        y = self.pool(y)
        y = self.classifier(y)
        return y

    def construct(self, x: Tensor) -> Tensor:
        x = self.forward_features(x)
        x = self.forward_head(x)
        return x
mindcv.models.hrnet.HRNet.forward_features(x)

Perform the feature extraction.

PARAMETER DESCRIPTION
x

Tensor

TYPE: Tensor

RETURNS DESCRIPTION
List[Tensor]

Extracted feature

Source code in mindcv\models\hrnet.py
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
def forward_features(self, x: Tensor) -> List[Tensor]:
    """Run the stem and all four HRNet stages.

    Args:
        x: input image tensor.

    Returns:
        The list of multi-resolution feature maps from stage 4.
    """
    # stem: two stride-2 conv/BN/ReLU steps
    x = self.relu(self.bn1(self.conv1(x)))
    x = self.relu(self.bn2(self.conv2(x)))

    # stage 1
    x = self.layer1(x)

    # stage 2: split into branches via transition1
    branch_inputs = []
    for i in range(self.stage2_cfg["num_branches"]):
        branch_inputs.append(self.transition1[i](x) if self.transition1_flags[i] else x)
    feats = self.stage2(branch_inputs)

    # stage 3: flagged transitions consume the lowest-resolution feature
    branch_inputs = []
    for i in range(self.stage3_cfg["num_branches"]):
        branch_inputs.append(
            self.transition2[i](feats[-1]) if self.transition2_flags[i] else feats[i]
        )
    feats = self.stage3(branch_inputs)

    # stage 4
    branch_inputs = []
    for i in range(self.stage4_cfg["num_branches"]):
        branch_inputs.append(
            self.transition3[i](feats[-1]) if self.transition3_flags[i] else feats[i]
        )
    return self.stage4(branch_inputs)

mindcv.models.hrnet.HRNetFeatures

Bases: HRNet

The feature extraction version of HRNet

Source code in mindcv\models\hrnet.py
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
class HRNetFeatures(HRNet):
    """
    The feature extraction version of HRNet
    """
    def __init__(self, **kwargs) -> None:
        super(HRNetFeatures, self).__init__(**kwargs)
        head_channels = [32, 64, 128, 256]
        # Stem feature is taken after conv1 (stride 2), hence reduction 2.
        curr_stride = 2
        self.feature_info = [dict(chs=64, reduction=curr_stride, name="stem")]

        for i, c in enumerate(head_channels):
            curr_stride *= 2
            # x4: incre_modules widen each map with Bottleneck blocks
            # (presumably Bottleneck.expansion == 4 — TODO confirm).
            c = c * 4
            self.feature_info += [dict(chs=c, reduction=curr_stride, name=f'stage{i + 1}')]

        # NOTE(review): presumably signals to the feature-extraction wrapper
        # that construct already returns the feature list — verify against
        # the features_only machinery.
        self.is_rewritten = True

    def construct(self, x: Tensor) -> List[Tensor]:
        out = []

        # stem; the intermediate conv1 activation is exposed as "stem"
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        out.append(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu(x)

        # stage 1
        x = self.layer1(x)

        # stage 2
        x_list = []
        for i in range(self.stage2_cfg["num_branches"]):
            if self.transition1_flags[i]:
                x_list.append(self.transition1[i](x))
            else:
                x_list.append(x)
        y_list = self.stage2(x_list)

        # stage 3
        x_list = []
        for i in range(self.stage3_cfg["num_branches"]):
            if self.transition2_flags[i]:
                x_list.append(self.transition2[i](y_list[-1]))
            else:
                x_list.append(y_list[i])
        y_list = self.stage3(x_list)

        # stage 4
        x_list = []
        for i in range(self.stage4_cfg["num_branches"]):
            if self.transition3_flags[i]:
                x_list.append(self.transition3[i](y_list[-1]))
            else:
                x_list.append(y_list[i])
        y_list = self.stage4(x_list)

        # Widen each pyramid level with the head's incre modules.
        for f, incre in zip(y_list, self.incre_modules):
            out.append(incre(f))

        return out

mindcv.models.hrnet.IdentityCell

Bases: Cell

Identity Cell

Source code in mindcv\models\hrnet.py
35
36
37
38
39
40
41
42
class IdentityCell(nn.Cell):
    """A pass-through cell that returns its input unchanged.

    Used as a placeholder where a transition or fuse layer is not needed.
    """

    def __init__(self) -> None:
        super().__init__()

    def construct(self, x: Any) -> Any:
        # No-op: forward the input as-is.
        return x

mindcv.models.hrnet.hrnet_w32(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get HRNet with width=32 model. Refer to the base class models.HRNet for more details.

PARAMETER DESCRIPTION
pretrained

Whether the model is pretrained. Default: False

TYPE: bool DEFAULT: False

num_classes

number of classification classes. Default: 1000

TYPE: int DEFAULT: 1000

in_channels

Number of input channels. Default: 3

TYPE: int DEFAULT: 3

RETURNS DESCRIPTION
Union[HRNet, HRNetFeatures]

HRNet model

Source code in mindcv\models\hrnet.py
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
@register_model
def hrnet_w32(
    pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs
) -> Union[HRNet, HRNetFeatures]:
    """Get HRNet with width=32 model.
    Refer to the base class `models.HRNet` for more details.

    Args:
        pretrained: Whether the model is pretrained. Default: False
        num_classes: number of classification classes. Default: 1000
        in_channels: Number of input channels. Default: 3

    Returns:
        HRNet model
    """
    # Branch widths double with each additional resolution: 32, 64, 128, 256.
    widths = [32, 64, 128, 256]
    stage_cfg = dict(
        stage1=dict(
            num_modules=1,
            num_branches=1,
            block="BOTTLENECK",
            num_blocks=[4],
            num_channels=[64],
        ),
        stage2=dict(
            num_modules=1,
            num_branches=2,
            block="BASIC",
            num_blocks=[4, 4],
            num_channels=widths[:2],
        ),
        stage3=dict(
            num_modules=4,
            num_branches=3,
            block="BASIC",
            num_blocks=[4, 4, 4],
            num_channels=widths[:3],
        ),
        stage4=dict(
            num_modules=3,
            num_branches=4,
            block="BASIC",
            num_blocks=[4, 4, 4, 4],
            num_channels=widths,
        ),
    )
    model_args = dict(stage_cfg=stage_cfg, num_classes=num_classes, in_channels=in_channels, **kwargs)
    return _create_hrnet(
        pretrained, **dict(default_cfg=default_cfgs["hrnet_w32"], **model_args)
    )

mindcv.models.hrnet.hrnet_w48(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get HRNet with width=48 model. Refer to the base class models.HRNet for more details.

PARAMETER DESCRIPTION
pretrained

Whether the model is pretrained. Default: False

TYPE: bool DEFAULT: False

num_classes

number of classification classes. Default: 1000

TYPE: int DEFAULT: 1000

in_channels

Number of input channels. Default: 3

TYPE: int DEFAULT: 3

RETURNS DESCRIPTION
Union[HRNet, HRNetFeatures]

HRNet model

Source code in mindcv\models\hrnet.py
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
@register_model
def hrnet_w48(
    pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs
) -> Union[HRNet, HRNetFeatures]:
    """Get HRNet with width=48 model.
    Refer to the base class `models.HRNet` for more details.

    Args:
        pretrained: Whether the model is pretrained. Default: False
        num_classes: number of classification classes. Default: 1000
        in_channels: Number of input channels. Default: 3

    Returns:
        HRNet model
    """
    # Branch widths double with each additional resolution: 48, 96, 192, 384.
    widths = [48, 96, 192, 384]
    stage_cfg = dict(
        stage1=dict(
            num_modules=1,
            num_branches=1,
            block="BOTTLENECK",
            num_blocks=[4],
            num_channels=[64],
        ),
        stage2=dict(
            num_modules=1,
            num_branches=2,
            block="BASIC",
            num_blocks=[4, 4],
            num_channels=widths[:2],
        ),
        stage3=dict(
            num_modules=4,
            num_branches=3,
            block="BASIC",
            num_blocks=[4, 4, 4],
            num_channels=widths[:3],
        ),
        stage4=dict(
            num_modules=3,
            num_branches=4,
            block="BASIC",
            num_blocks=[4, 4, 4, 4],
            num_channels=widths,
        ),
    )
    model_args = dict(stage_cfg=stage_cfg, num_classes=num_classes, in_channels=in_channels, **kwargs)
    return _create_hrnet(
        pretrained, **dict(default_cfg=default_cfgs["hrnet_w48"], **model_args)
    )

inceptionv3

mindcv.models.inceptionv3

MindSpore implementation of InceptionV3. Refer to Rethinking the Inception Architecture for Computer Vision.

mindcv.models.inceptionv3.BasicConv2d

Bases: Cell

A block for conv bn and relu

Source code in mindcv\models\inceptionv3.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
class BasicConv2d(nn.Cell):
    """Conv2d -> BatchNorm2d -> ReLU building block."""

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: Union[int, Tuple] = 1,
        stride: int = 1,
        padding: int = 0,
        pad_mode: str = "same",
    ) -> None:
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            padding=padding,
            pad_mode=pad_mode,
        )
        # eps/momentum kept as in the original implementation.
        self.bn = nn.BatchNorm2d(out_channels, eps=0.001, momentum=0.9997)
        self.relu = nn.ReLU()

    def construct(self, x: Tensor) -> Tensor:
        return self.relu(self.bn(self.conv(x)))

mindcv.models.inceptionv3.InceptionAux

Bases: Cell

Inception module for the aux classifier head

Source code in mindcv\models\inceptionv3.py
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
class InceptionAux(nn.Cell):
    """Inception module for the aux classifier head.

    Args:
        in_channels: number of channels of the input feature map.
        num_classes: number of classification classes.
    """

    def __init__(
        self,
        in_channels: int,
        num_classes: int,
    ) -> None:
        super().__init__()
        self.avg_pool = nn.AvgPool2d(5, stride=3, pad_mode="valid")
        self.conv0 = BasicConv2d(in_channels, 128, kernel_size=1)
        self.conv1 = BasicConv2d(128, 768, kernel_size=5, pad_mode="valid")
        self.flatten = nn.Flatten()
        # The classifier input width is fixed by conv1's 768 output channels
        # (flattened 1x1 spatial map — assumes the standard 17x17 aux input;
        # TODO confirm), NOT by `in_channels`. The previous `nn.Dense(
        # in_channels, ...)` only worked because the aux head happened to be
        # attached where in_channels == 768.
        self.fc = nn.Dense(768, num_classes)

    def construct(self, x: Tensor) -> Tensor:
        x = self.avg_pool(x)
        x = self.conv0(x)
        x = self.conv1(x)
        x = self.flatten(x)
        x = self.fc(x)
        return x

mindcv.models.inceptionv3.InceptionV3

Bases: Cell

Inception v3 model architecture from "Rethinking the Inception Architecture for Computer Vision" <https://arxiv.org/abs/1512.00567>_.

.. note:: Important: In contrast to the other models the inception_v3 expects tensors with a size of N x 3 x 299 x 299, so ensure your images are sized accordingly.

PARAMETER DESCRIPTION
num_classes

number of classification classes. Default: 1000.

TYPE: int DEFAULT: 1000

aux_logits

use auxiliary classifier or not. Default: True.

TYPE: bool DEFAULT: True

in_channels

number the channels of the input. Default: 3.

TYPE: int DEFAULT: 3

drop_rate

dropout rate of the layer before main classifier. Default: 0.2.

TYPE: float DEFAULT: 0.2

Source code in mindcv\models\inceptionv3.py
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
class InceptionV3(nn.Cell):
    r"""Inception v3 model architecture from
    `"Rethinking the Inception Architecture for Computer Vision" <https://arxiv.org/abs/1512.00567>`_.

    .. note::
        **Important**: In contrast to the other models the inception_v3 expects tensors with a size of
        N x 3 x 299 x 299, so ensure your images are sized accordingly.

    Args:
        num_classes: number of classification classes. Default: 1000.
        aux_logits: use auxiliary classifier or not. Default: True.
        in_channels: number the channels of the input. Default: 3.
        drop_rate: dropout rate of the layer before main classifier. Default: 0.2.
    """

    def __init__(
        self,
        num_classes: int = 1000,
        aux_logits: bool = True,
        in_channels: int = 3,
        drop_rate: float = 0.2,
    ) -> None:
        super().__init__()
        self.aux_logits = aux_logits
        # Stem: strided/valid convolutions interleaved with max-pooling.
        self.conv1a = BasicConv2d(in_channels, 32, kernel_size=3, stride=2, pad_mode="valid")
        self.conv2a = BasicConv2d(32, 32, kernel_size=3, stride=1, pad_mode="valid")
        self.conv2b = BasicConv2d(32, 64, kernel_size=3, stride=1)
        self.maxpool1 = nn.MaxPool2d(kernel_size=3, stride=2)
        self.conv3b = BasicConv2d(64, 80, kernel_size=1)
        self.conv4a = BasicConv2d(80, 192, kernel_size=3, pad_mode="valid")
        self.maxpool2 = nn.MaxPool2d(kernel_size=3, stride=2)
        # Inception stages; the first argument is the running channel count.
        self.inception5b = InceptionA(192, pool_features=32)
        self.inception5c = InceptionA(256, pool_features=64)
        self.inception5d = InceptionA(288, pool_features=64)
        self.inception6a = InceptionB(288)
        self.inception6b = InceptionC(768, channels_7x7=128)
        self.inception6c = InceptionC(768, channels_7x7=160)
        self.inception6d = InceptionC(768, channels_7x7=160)
        self.inception6e = InceptionC(768, channels_7x7=192)
        if self.aux_logits:
            # Auxiliary head attached to the 768-channel output of inception6e.
            self.aux = InceptionAux(768, num_classes)
        self.inception7a = InceptionD(768)
        self.inception7b = InceptionE(1280)
        self.inception7c = InceptionE(2048)

        self.pool = GlobalAvgPooling()
        self.dropout = Dropout(p=drop_rate)
        self.num_features = 2048
        self.classifier = nn.Dense(self.num_features, num_classes)
        self._initialize_weights()

    def _initialize_weights(self) -> None:
        """Initialize weights for cells (Xavier-uniform on every Conv2d)."""
        for _, cell in self.cells_and_names():
            if isinstance(cell, nn.Conv2d):
                cell.weight.set_data(
                    init.initializer(init.XavierUniform(), cell.weight.shape, cell.weight.dtype))

    def forward_preaux(self, x: Tensor) -> Tensor:
        # Feature path up to (and including) inception6e, where the aux head branches off.
        x = self.conv1a(x)
        x = self.conv2a(x)
        x = self.conv2b(x)
        x = self.maxpool1(x)
        x = self.conv3b(x)
        x = self.conv4a(x)
        x = self.maxpool2(x)
        x = self.inception5b(x)
        x = self.inception5c(x)
        x = self.inception5d(x)
        x = self.inception6a(x)
        x = self.inception6b(x)
        x = self.inception6c(x)
        x = self.inception6d(x)
        x = self.inception6e(x)
        return x

    def forward_postaux(self, x: Tensor) -> Tensor:
        # Remaining Inception stages after the aux branch point.
        x = self.inception7a(x)
        x = self.inception7b(x)
        x = self.inception7c(x)
        return x

    def forward_features(self, x: Tensor) -> Tensor:
        # Full feature extractor (no aux head); same path construct() takes.
        x = self.forward_preaux(x)
        x = self.forward_postaux(x)
        return x

    def construct(self, x: Tensor) -> Union[Tensor, Tuple[Tensor, Tensor]]:
        # Returns (logits, aux_logits) while training with aux_logits=True,
        # otherwise just the main logits.
        x = self.forward_preaux(x)
        if self.training and self.aux_logits:
            aux = self.aux(x)
        else:
            aux = None
        x = self.forward_postaux(x)

        x = self.pool(x)
        x = self.dropout(x)
        x = self.classifier(x)

        if self.training and self.aux_logits:
            return x, aux
        return x

mindcv.models.inceptionv3.inception_v3(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get InceptionV3 model. Refer to the base class models.InceptionV3 for more details.

Source code in mindcv\models\inceptionv3.py
328
329
330
331
332
333
334
335
336
337
338
@register_model
def inception_v3(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> InceptionV3:
    """Build an InceptionV3 network (aux classifier enabled).
    See the base class `models.InceptionV3` for architecture details."""
    cfg = default_cfgs["inception_v3"]
    model = InceptionV3(num_classes=num_classes, aux_logits=True, in_channels=in_channels, **kwargs)
    if pretrained:
        load_pretrained(model, cfg, num_classes=num_classes, in_channels=in_channels)
    return model

inceptionv4

mindcv.models.inceptionv4

MindSpore implementation of InceptionV4. Refer to Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning.

mindcv.models.inceptionv4.BasicConv2d

Bases: Cell

A block for conv bn and relu

Source code in mindcv\models\inceptionv4.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
class BasicConv2d(nn.Cell):
    """Conv2d -> BatchNorm2d -> ReLU building block."""

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: Union[int, Tuple] = 1,
        stride: int = 1,
        padding: int = 0,
        pad_mode: str = "same",
    ) -> None:
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            padding=padding,
            pad_mode=pad_mode,
        )
        self.bn = nn.BatchNorm2d(out_channels, eps=0.001, momentum=0.9997)
        self.relu = nn.ReLU()

    def construct(self, x: Tensor) -> Tensor:
        out = self.conv(x)
        out = self.bn(out)
        return self.relu(out)

mindcv.models.inceptionv4.InceptionA

Bases: Cell

Inception V4 model basic architecture

Source code in mindcv\models\inceptionv4.py
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
class InceptionA(nn.Cell):
    """Inception V4 block mixing 1x1, 3x3, double-3x3, and pooled branches."""

    def __init__(self) -> None:
        super().__init__()
        self.branch_0 = BasicConv2d(384, 96, kernel_size=1, stride=1)
        self.branch_1 = nn.SequentialCell([
            BasicConv2d(384, 64, kernel_size=1, stride=1),
            BasicConv2d(64, 96, kernel_size=3, stride=1, pad_mode="pad", padding=1)
        ])
        self.branch_2 = nn.SequentialCell([
            BasicConv2d(384, 64, kernel_size=1, stride=1),
            BasicConv2d(64, 96, kernel_size=3, stride=1, pad_mode="pad", padding=1),
            BasicConv2d(96, 96, kernel_size=3, stride=1, pad_mode="pad", padding=1)
        ])
        self.branch_3 = nn.SequentialCell([
            nn.AvgPool2d(kernel_size=3, stride=1, pad_mode="same"),
            BasicConv2d(384, 96, kernel_size=1, stride=1)
        ])

    def construct(self, x: Tensor) -> Tensor:
        # All four branches see the same input; outputs are channel-concatenated.
        branches = (self.branch_0(x), self.branch_1(x), self.branch_2(x), self.branch_3(x))
        return ops.concat(branches, axis=1)

mindcv.models.inceptionv4.InceptionB

Bases: Cell

Inception V4 model basic architecture

Source code in mindcv\models\inceptionv4.py
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
class InceptionB(nn.Cell):
    """Inception V4 block using factorized 1x7 / 7x1 convolutions."""

    def __init__(self) -> None:
        super().__init__()
        self.branch_0 = BasicConv2d(1024, 384, kernel_size=1, stride=1)
        self.branch_1 = nn.SequentialCell([
            BasicConv2d(1024, 192, kernel_size=1, stride=1),
            BasicConv2d(192, 224, kernel_size=(1, 7), stride=1),
            BasicConv2d(224, 256, kernel_size=(7, 1), stride=1),
        ])
        self.branch_2 = nn.SequentialCell([
            BasicConv2d(1024, 192, kernel_size=1, stride=1),
            BasicConv2d(192, 192, kernel_size=(7, 1), stride=1),
            BasicConv2d(192, 224, kernel_size=(1, 7), stride=1),
            BasicConv2d(224, 224, kernel_size=(7, 1), stride=1),
            BasicConv2d(224, 256, kernel_size=(1, 7), stride=1)
        ])
        self.branch_3 = nn.SequentialCell([
            nn.AvgPool2d(kernel_size=3, stride=1, pad_mode="same"),
            BasicConv2d(1024, 128, kernel_size=1, stride=1)
        ])

    def construct(self, x: Tensor) -> Tensor:
        # Channel-concatenate the four parallel branch outputs.
        branches = (self.branch_0(x), self.branch_1(x), self.branch_2(x), self.branch_3(x))
        return ops.concat(branches, axis=1)

mindcv.models.inceptionv4.InceptionC

Bases: Cell

Inception V4 model basic architecture

Source code in mindcv\models\inceptionv4.py
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
class InceptionC(nn.Cell):
    """Inception V4 block whose middle branches fan out into parallel 1x3/3x1 convs."""

    def __init__(self) -> None:
        super().__init__()
        self.branch_0 = BasicConv2d(1536, 256, kernel_size=1, stride=1)

        self.branch_1 = BasicConv2d(1536, 384, kernel_size=1, stride=1)
        self.branch_1_1 = BasicConv2d(384, 256, kernel_size=(1, 3), stride=1)
        self.branch_1_2 = BasicConv2d(384, 256, kernel_size=(3, 1), stride=1)

        self.branch_2 = nn.SequentialCell([
            BasicConv2d(1536, 384, kernel_size=1, stride=1),
            BasicConv2d(384, 448, kernel_size=(3, 1), stride=1),
            BasicConv2d(448, 512, kernel_size=(1, 3), stride=1),
        ])
        self.branch_2_1 = BasicConv2d(512, 256, kernel_size=(1, 3), stride=1)
        self.branch_2_2 = BasicConv2d(512, 256, kernel_size=(3, 1), stride=1)

        self.branch_3 = nn.SequentialCell([
            nn.AvgPool2d(kernel_size=3, stride=1, pad_mode="same"),
            BasicConv2d(1536, 256, kernel_size=1, stride=1)
        ])

    def construct(self, x: Tensor) -> Tensor:
        b0 = self.branch_0(x)
        # Branches 1 and 2 each split into a (1x3, 3x1) pair after a shared stem.
        stem1 = self.branch_1(x)
        b1 = ops.concat((self.branch_1_1(stem1), self.branch_1_2(stem1)), axis=1)
        stem2 = self.branch_2(x)
        b2 = ops.concat((self.branch_2_1(stem2), self.branch_2_2(stem2)), axis=1)
        b3 = self.branch_3(x)
        return ops.concat((b0, b1, b2, b3), axis=1)

mindcv.models.inceptionv4.InceptionV4

Bases: Cell

Inception v4 model architecture from "Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning" <https://arxiv.org/abs/1602.07261>_. # noqa: E501

PARAMETER DESCRIPTION
num_classes

number of classification classes. Default: 1000.

TYPE: int DEFAULT: 1000

in_channels

number the channels of the input. Default: 3.

TYPE: int DEFAULT: 3

drop_rate

dropout rate of the layer before main classifier. Default: 0.2.

TYPE: float DEFAULT: 0.2

Source code in mindcv\models\inceptionv4.py
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
class InceptionV4(nn.Cell):
    r"""Inception v4 model architecture from
    `"Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning" <https://arxiv.org/abs/1602.07261>`_.  # noqa: E501

    Args:
        num_classes: number of classification classes. Default: 1000.
        in_channels: number the channels of the input. Default: 3.
        drop_rate: dropout rate of the layer before main classifier. Default: 0.2.
    """

    def __init__(
        self,
        num_classes: int = 1000,
        in_channels: int = 3,
        drop_rate: float = 0.2,
    ) -> None:
        super().__init__()
        # Stage layout: stem, 4 x InceptionA, ReductionA, 7 x InceptionB,
        # ReductionB, 3 x InceptionC.
        stages = [Stem(in_channels)]
        stages += [InceptionA() for _ in range(4)]
        stages.append(ReductionA())
        stages += [InceptionB() for _ in range(7)]
        stages.append(ReductionB())
        stages += [InceptionC() for _ in range(3)]
        self.features = nn.SequentialCell(stages)

        self.pool = GlobalAvgPooling()
        self.dropout = Dropout(p=drop_rate)
        self.num_features = 1536
        self.classifier = nn.Dense(self.num_features, num_classes)
        self._initialize_weights()

    def _initialize_weights(self) -> None:
        """Apply Xavier-uniform initialization to every convolution weight."""
        for _, cell in self.cells_and_names():
            if isinstance(cell, nn.Conv2d):
                cell.weight.set_data(
                    init.initializer(init.XavierUniform(), cell.weight.shape, cell.weight.dtype))

    def forward_features(self, x: Tensor) -> Tensor:
        return self.features(x)

    def forward_head(self, x: Tensor) -> Tensor:
        out = self.pool(x)
        out = self.dropout(out)
        return self.classifier(out)

    def construct(self, x: Tensor) -> Tensor:
        return self.forward_head(self.forward_features(x))

mindcv.models.inceptionv4.ReductionA

Bases: Cell

Inception V4 grid-size reduction block

Source code in mindcv\models\inceptionv4.py
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
class ReductionA(nn.Cell):
    """Inception V4 reduction block: stride-2 branches shrink the spatial grid."""

    def __init__(self) -> None:
        super().__init__()
        self.branch_0 = BasicConv2d(384, 384, kernel_size=3, stride=2, pad_mode="valid")
        self.branch_1 = nn.SequentialCell([
            BasicConv2d(384, 192, kernel_size=1, stride=1),
            BasicConv2d(192, 224, kernel_size=3, stride=1, pad_mode="pad", padding=1),
            BasicConv2d(224, 256, kernel_size=3, stride=2, pad_mode="valid"),
        ])
        self.branch_2 = nn.MaxPool2d(3, stride=2)

    def construct(self, x: Tensor) -> Tensor:
        # Two convolutional branches plus a max-pool branch, concatenated on channels.
        branches = (self.branch_0(x), self.branch_1(x), self.branch_2(x))
        return ops.concat(branches, axis=1)

mindcv.models.inceptionv4.ReductionB

Bases: Cell

Inception V4 grid-size reduction block

Source code in mindcv\models\inceptionv4.py
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
class ReductionB(nn.Cell):
    """Inception V4 reduction block: stride-2 branches shrink the spatial grid."""

    def __init__(self) -> None:
        super().__init__()
        self.branch_0 = nn.SequentialCell([
            BasicConv2d(1024, 192, kernel_size=1, stride=1),
            BasicConv2d(192, 192, kernel_size=3, stride=2, pad_mode="valid"),
        ])
        self.branch_1 = nn.SequentialCell([
            BasicConv2d(1024, 256, kernel_size=1, stride=1),
            BasicConv2d(256, 256, kernel_size=(1, 7), stride=1),
            BasicConv2d(256, 320, kernel_size=(7, 1), stride=1),
            BasicConv2d(320, 320, kernel_size=3, stride=2, pad_mode="valid")
        ])
        self.branch_2 = nn.MaxPool2d(3, stride=2)

    def construct(self, x: Tensor) -> Tensor:
        # Concatenated output carries 192 + 320 + 1024 = 1536 channels (8 x 8 x 1536).
        branches = (self.branch_0(x), self.branch_1(x), self.branch_2(x))
        return ops.concat(branches, axis=1)

mindcv.models.inceptionv4.Stem

Bases: Cell

Inception V4 model blocks.

Source code in mindcv\models\inceptionv4.py
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
class Stem(nn.Cell):
    """Inception V4 stem: initial convolutions followed by three mixed
    (convolution / max-pool) stages whose outputs are channel-concatenated.

    The inline shape comments in ``construct`` trace an input whose spatial
    size reduces 149 -> 147 -> 73 -> 71 -> 35 (consistent with a 299x299
    input -- TODO confirm against the model's default image size).
    """

    def __init__(self, in_channels: int) -> None:
        super().__init__()
        self.conv2d_1a_3x3 = BasicConv2d(in_channels, 32, kernel_size=3, stride=2, pad_mode="valid")
        self.conv2d_2a_3x3 = BasicConv2d(32, 32, kernel_size=3, stride=1, pad_mode="valid")
        self.conv2d_2b_3x3 = BasicConv2d(32, 64, kernel_size=3, stride=1, pad_mode="pad", padding=1)

        # Mixed stage 3a: pool branch alongside a stride-2 conv branch.
        self.mixed_3a_branch_0 = nn.MaxPool2d(3, stride=2)
        self.mixed_3a_branch_1 = BasicConv2d(64, 96, kernel_size=3, stride=2, pad_mode="valid")

        # Mixed stage 4a: plain 3x3 branch vs. factorized (1x7, 7x1) branch.
        self.mixed_4a_branch_0 = nn.SequentialCell([
            BasicConv2d(160, 64, kernel_size=1, stride=1),
            BasicConv2d(64, 96, kernel_size=3, stride=1, pad_mode="valid")
        ])

        self.mixed_4a_branch_1 = nn.SequentialCell([
            BasicConv2d(160, 64, kernel_size=1, stride=1),
            BasicConv2d(64, 64, kernel_size=(1, 7), stride=1),
            BasicConv2d(64, 64, kernel_size=(7, 1), stride=1),
            BasicConv2d(64, 96, kernel_size=3, stride=1, pad_mode="valid")
        ])

        # Mixed stage 5a: stride-2 conv branch alongside a max-pool branch.
        self.mixed_5a_branch_0 = BasicConv2d(192, 192, kernel_size=3, stride=2, pad_mode="valid")
        self.mixed_5a_branch_1 = nn.MaxPool2d(3, stride=2)

    def construct(self, x: Tensor) -> Tensor:
        x = self.conv2d_1a_3x3(x)  # 149 x 149 x 32
        x = self.conv2d_2a_3x3(x)  # 147 x 147 x 32
        x = self.conv2d_2b_3x3(x)  # 147 x 147 x 64

        x0 = self.mixed_3a_branch_0(x)
        x1 = self.mixed_3a_branch_1(x)
        x = ops.concat((x0, x1), axis=1)  # 73 x 73 x 160

        x0 = self.mixed_4a_branch_0(x)
        x1 = self.mixed_4a_branch_1(x)
        x = ops.concat((x0, x1), axis=1)  # 71 x 71 x 192

        x0 = self.mixed_5a_branch_0(x)
        x1 = self.mixed_5a_branch_1(x)
        x = ops.concat((x0, x1), axis=1)  # 35 x 35 x 384
        return x

mindcv.models.inceptionv4.inception_v4(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get InceptionV4 model. Refer to the base class models.InceptionV4 for more details.

Source code in mindcv\models\inceptionv4.py
310
311
312
313
314
315
316
317
318
319
320
@register_model
def inception_v4(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> InceptionV4:
    """Build an InceptionV4 network.
    See the base class `models.InceptionV4` for architecture details."""
    cfg = default_cfgs["inception_v4"]
    model = InceptionV4(num_classes=num_classes, in_channels=in_channels, **kwargs)
    if pretrained:
        load_pretrained(model, cfg, num_classes=num_classes, in_channels=in_channels)
    return model

mae

mindcv.models.mae

mindcv.models.mae.MAEForPretrain

Bases: Cell

Source code in mindcv\models\mae.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
class MAEForPretrain(nn.Cell):
    """Masked Autoencoder (MAE) for self-supervised pre-training.

    A ViT-style encoder processes only the visible (unmasked) patches; a
    lighter decoder receives the encoded tokens plus learned mask tokens,
    restored to the original patch order, and regresses raw pixel values for
    every patch. ``construct`` returns the reconstruction loss averaged over
    the masked patches only.
    """

    def __init__(
        self,
        image_size: int = 224,
        patch_size: int = 16,
        in_channels: int = 3,
        embed_dim: int = 1024,
        depth: int = 24,
        num_heads: int = 16,
        mlp_ratio: float = 4.,
        decoder_embed_dim: int = 512,
        decoder_depth: int = 8,
        decoder_num_heads: int = 16,
        qkv_bias: bool = True,
        qk_norm: bool = False,
        proj_drop_rate: float = 0.,
        attn_drop_rate: float = 0.,
        drop_path_rate: float = 0.,
        init_values: Optional[float] = None,
        act_layer: nn.Cell = nn.GELU,
        norm_layer: nn.Cell = nn.LayerNorm,
        mlp_layer: Callable = Mlp,
        norm_pix_loss: bool = True,
        mask_ratio: float = 0.75,
        **kwargs,
    ):
        super(MAEForPretrain, self).__init__()
        self.patch_embed = PatchEmbed(image_size=image_size, patch_size=patch_size,
                                      in_chans=in_channels, embed_dim=embed_dim)
        self.num_patches = self.patch_embed.num_patches
        # Stochastic-depth rates grow linearly across the encoder blocks.
        dpr = [x.item() for x in np.linspace(0, drop_path_rate, depth)]
        self.blocks = nn.CellList([
            Block(
                dim=embed_dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_norm=qk_norm,
                attn_drop=attn_drop_rate, proj_drop=proj_drop_rate,
                mlp_ratio=mlp_ratio, drop_path=dpr[i], init_values=init_values,
                act_layer=act_layer, norm_layer=norm_layer, mlp_layer=mlp_layer,
            ) for i in range(depth)
        ])

        self.cls_token = Parameter(initializer(Normal(sigma=0.02), (1, 1, embed_dim)))

        # Number of patches left visible to the encoder.
        self.unmask_len = int(np.floor(self.num_patches * (1 - mask_ratio)))

        # Fixed (non-trainable) 2D sin-cos position embeddings with a leading cls slot.
        encoder_pos_emb = Tensor(get_2d_sincos_pos_embed(
            embed_dim, int(self.num_patches ** 0.5), cls_token=True), ms.float32
        )
        encoder_pos_emb = ops.expand_dims(encoder_pos_emb, axis=0)
        self.pos_embed = Parameter(encoder_pos_emb, requires_grad=False)
        self.norm = norm_layer((embed_dim,))

        self.decoder_embed = nn.Dense(embed_dim, decoder_embed_dim)
        self.mask_token = Parameter(initializer(Normal(sigma=0.02), (1, 1, decoder_embed_dim)))

        decoder_pos_emb = Tensor(get_2d_sincos_pos_embed(
            decoder_embed_dim, int(self.num_patches ** 0.5), cls_token=True), ms.float32
        )
        decoder_pos_emb = ops.expand_dims(decoder_pos_emb, axis=0)
        self.decoder_pos_embed = Parameter(decoder_pos_emb, requires_grad=False)

        # NOTE(review): decoder blocks reuse dpr[i], which was sized by the
        # encoder `depth`; confirm the decoder was meant to share the start of
        # the encoder's drop-path schedule rather than use its own (or zero).
        self.decoder_blocks = nn.CellList([
            Block(
                dim=decoder_embed_dim, num_heads=decoder_num_heads, qkv_bias=qkv_bias, qk_norm=qk_norm,
                attn_drop=attn_drop_rate, proj_drop=proj_drop_rate,
                mlp_ratio=mlp_ratio, drop_path=dpr[i], init_values=init_values,
                act_layer=act_layer, norm_layer=norm_layer, mlp_layer=mlp_layer,
            ) for i in range(decoder_depth)
        ])
        self.decoder_norm = norm_layer((decoder_embed_dim,))
        # Predicts the raw pixels of one patch per token.
        self.decoder_pred = nn.Dense(decoder_embed_dim, patch_size ** 2 * in_channels)

        self.sort = ops.Sort()

        self.norm_pix_loss = norm_pix_loss
        self._init_weights()

    def _init_weights(self):
        """Xavier-init Dense and patch-embed projection weights; reset LayerNorm affine params."""
        for name, cell in self.cells_and_names():
            if isinstance(cell, nn.Dense):
                cell.weight.set_data(
                    initializer("xavier_uniform", cell.weight.shape, cell.weight.dtype)
                )
                if cell.bias is not None:
                    cell.bias.set_data(
                        initializer('zeros', cell.bias.shape, cell.bias.dtype)
                    )

            elif isinstance(cell, nn.LayerNorm):
                cell.gamma.set_data(
                    initializer('ones', cell.gamma.shape, cell.gamma.dtype)
                )
                cell.beta.set_data(
                    initializer('zeros', cell.beta.shape, cell.beta.dtype)
                )
            if name == "patch_embed.proj":
                cell.weight.set_data(
                    initializer("xavier_uniform", cell.weight.shape, cell.weight.dtype)
                )

    def patchify(self, imgs):
        """Split square images into flattened non-overlapping patches.

        imgs: (N, 3, H, W)
        x: (N, L, patch_size ** 2 * 3)
        """
        N, _, H, W = imgs.shape
        p = self.patch_embed.patch_size[0]
        assert H == W and H % p == 0
        h = w = H // p

        x = ops.reshape(imgs, (N, 3, h, p, w, p))
        x = ops.transpose(x, (0, 2, 4, 3, 5, 1))
        x = ops.reshape(x, (N, h * w, p ** 2 * 3))
        return x

    def unpatchify(self, x):
        """Inverse of ``patchify``: reassemble patches into images.

        x: (N, L, patch_size ** 2 * 3)
        imgs: (N, 3, H, W)
        """
        N, L, _ = x.shape
        p = self.patch_embed.patch_size[0]
        h = w = int(L ** 0.5)
        assert h * w == L

        imgs = ops.reshape(x, (N, h, w, p, p, 3))
        imgs = ops.transpose(imgs, (0, 5, 1, 3, 2, 4))
        imgs = ops.reshape(imgs, (N, 3, h * p, w * p))
        return imgs

    def apply_masking(self, x, mask):
        """Drop masked patches, keeping the first ``unmask_len`` after sorting by mask value.

        Returns the kept tokens and the indices needed to restore the
        original patch order (used later by the decoder).
        """
        D = x.shape[2]
        # Sorting the mask places unmasked entries first; sorting the shuffle
        # indices again yields the permutation that undoes the shuffle.
        _, ids_shuffle = self.sort(mask.astype(ms.float32))
        _, ids_restore = self.sort(ids_shuffle.astype(ms.float32))

        ids_keep = ids_shuffle[:, :self.unmask_len]
        ids_keep = ops.broadcast_to(ops.expand_dims(ids_keep, axis=-1), (-1, -1, D))
        x_unmasked = ops.gather_elements(x, dim=1, index=ids_keep)

        return x_unmasked, ids_restore

    def forward_features(self, x, mask):
        """Encode visible patches (plus cls token); returns tokens and restore indices."""
        x = self.patch_embed(x)
        bsz = x.shape[0]

        # Position embedding slot 0 is reserved for the cls token.
        x = x + self.pos_embed[:, 1:, :]
        x, ids_restore = self.apply_masking(x, mask)

        cls_token = self.cls_token + self.pos_embed[:, :1, :]
        cls_token = ops.broadcast_to(cls_token, (bsz, -1, -1))
        cls_token = cls_token.astype(x.dtype)
        x = ops.concat((cls_token, x), axis=1)

        for blk in self.blocks:
            x = blk(x)

        x = self.norm(x)
        return x, ids_restore

    def forward_decoder(self, x, ids_restore):
        """Decode: insert mask tokens, restore patch order, predict pixels per patch."""
        x = self.decoder_embed(x)
        bsz, L, D = x.shape

        # One mask token per dropped patch (+1 accounts for the cls token in L).
        mask_len = self.num_patches + 1 - L
        mask_tokens = ops.broadcast_to(self.mask_token, (bsz, mask_len, -1))
        mask_tokens = mask_tokens.astype(x.dtype)

        # Re-insert mask tokens (cls token excluded), un-shuffle, then re-attach cls.
        x_ = ops.concat((x[:, 1:, :], mask_tokens), axis=1)
        ids_restore = ops.broadcast_to(ops.expand_dims(ids_restore, axis=-1), (-1, -1, D))
        x_ = ops.gather_elements(x_, dim=1, index=ids_restore)
        x = ops.concat((x[:, :1, :], x_), axis=1)

        x = x + self.decoder_pos_embed

        for blk in self.decoder_blocks:
            x = blk(x)

        x = self.decoder_norm(x)
        x = self.decoder_pred(x)

        # Drop the cls token: only patch predictions are returned.
        return x[:, 1:, :]

    def forward_loss(self, imgs, pred, mask):
        """Mean squared error on pixels, averaged over masked patches only."""
        target = self.patchify(imgs)
        if self.norm_pix_loss:
            # Normalize each patch's pixels to zero mean / unit std.
            # NOTE(review): no epsilon guard on std -- a constant patch would
            # divide by zero; confirm inputs make this impossible.
            mean = target.mean(axis=-1, keep_dims=True)
            std = target.std(axis=-1, keepdims=True)
            target = (target - mean) / std

        loss = (pred - target) ** 2
        loss = loss.mean(axis=-1)

        # `mask` is 1 for masked patches, so this averages over masked patches only.
        mask = mask.astype(loss.dtype)
        loss = (loss * mask).sum() / mask.sum()
        return loss

    def construct(self, imgs, mask):
        """Full pre-training step: encode visible patches, decode, return loss."""
        bsz = imgs.shape[0]
        mask = ops.reshape(mask, (bsz, -1))
        features, ids_restore = self.forward_features(imgs, mask)
        pred = self.forward_decoder(features, ids_restore)
        loss = self.forward_loss(imgs, pred, mask)
        return loss

    def get_num_layers(self):
        # Encoder depth only (used e.g. for layer-wise lr decay).
        return len(self.blocks)

    def no_weight_decay(self):
        # Parameters excluded from weight decay.
        return {'pos_embed', 'cls_token'}
mindcv.models.mae.MAEForPretrain.patchify(imgs)
Source code in mindcv\models\mae.py
195
196
197
198
199
200
201
202
203
204
205
206
207
208
def patchify(self, imgs):
    """
    imgs: (N, 3, H, W)
    x: (N, L, patch_size ** 2 * 3)
    """
    N, _, H, W = imgs.shape
    p = self.patch_embed.patch_size[0]
    assert H == W and H % p == 0
    h = w = H // p

    x = ops.reshape(imgs, (N, 3, h, p, w, p))
    x = ops.transpose(x, (0, 2, 4, 3, 5, 1))
    x = ops.reshape(x, (N, h * w, p ** 2 * 3))
    return x
mindcv.models.mae.MAEForPretrain.unpatchify(x)
Source code in mindcv\models\mae.py
210
211
212
213
214
215
216
217
218
219
220
221
222
223
def unpatchify(self, x):
    """
    x: (N, L, patch_size ** 2 * 3)
    imgs: (N, 3, H, W)
    """
    N, L, _ = x.shape
    p = self.patch_embed.patch_size[0]
    h = w = int(L ** 0.5)
    assert h * w == L

    imgs = ops.reshape(x, (N, h, w, p, p, 3))
    imgs = ops.transpose(imgs, (0, 5, 1, 3, 2, 4))
    imgs = ops.reshape(imgs, (N, 3, h * p, w * p))
    return imgs

mindcv.models.mae.get_1d_sincos_pos_embed_from_grid(embed_dim, pos)

Source code in mindcv\models\mae.py
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """Build 1-D sine/cosine positional embeddings.

    Args:
        embed_dim: output dimension for each position (must be even).
        pos: array of positions to encode; flattened to shape (M,).

    Returns:
        (M, embed_dim) array: sin half concatenated with cos half.
    """
    assert embed_dim % 2 == 0
    half = embed_dim // 2
    # Geometric frequency schedule: 1 / 10000^(i / half), i = 0..half-1.
    freqs = np.arange(half, dtype=np.float32)
    freqs /= embed_dim / 2.
    freqs = 1. / 10000 ** freqs  # (D/2,)

    angles = np.outer(pos.reshape(-1), freqs)  # (M, D/2)

    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)  # (M, D)

mindcv.models.mae.get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False)

Source code in mindcv\models\mae.py
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
    """Build 2-D sine/cosine positional embeddings for a square grid.

    Args:
        embed_dim: embedding dimension per position.
        grid_size: int of the grid height and width.
        cls_token: if True, prepend an all-zero row for the class token.

    Returns:
        pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
    """
    axis = np.arange(grid_size, dtype=np.float32)
    mesh = np.meshgrid(axis, axis)  # here w goes first
    mesh = np.stack(mesh, axis=0).reshape([2, 1, grid_size, grid_size])
    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, mesh)
    if cls_token:
        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
    return pos_embed

mixnet

mindcv.models.mixnet

MindSpore implementation of MixNet. Refer to MixConv: Mixed Depthwise Convolutional Kernels

mindcv.models.mixnet.MDConv

Bases: Cell

Mixed Depth-wise Convolution

Source code in mindcv\models\mixnet.py
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
class MDConv(nn.Cell):
    """Mixed depth-wise convolution.

    The input channels are split into groups, each group is convolved
    depth-wise with its own kernel size, and the results are concatenated
    back along the channel axis. With a single kernel size this collapses
    to a plain depth-wise convolution.
    """

    def __init__(self, channels: int, kernel_size: list, stride: int) -> None:
        super(MDConv, self).__init__()
        self.num_groups = len(kernel_size)

        if self.num_groups == 1:
            # Single kernel size: an ordinary depth-wise conv suffices.
            self.mixed_depthwise_conv = nn.Conv2d(
                channels,
                channels,
                kernel_size[0],
                stride=stride,
                pad_mode="pad",
                padding=kernel_size[0] // 2,
                group=channels,
                has_bias=False
            )
        else:
            self.split_channels = _splitchannels(channels, self.num_groups)

            # One depth-wise conv per channel slice, each with its own kernel.
            branches = [
                nn.Conv2d(
                    self.split_channels[idx],
                    self.split_channels[idx],
                    kernel_size[idx],
                    stride=stride,
                    pad_mode="pad",
                    padding=kernel_size[idx] // 2,
                    group=self.split_channels[idx],
                    has_bias=False
                )
                for idx in range(self.num_groups)
            ]
            self.mixed_depthwise_conv = nn.CellList(branches)

    def construct(self, x: Tensor) -> Tensor:
        if self.num_groups == 1:
            return self.mixed_depthwise_conv(x)

        outputs = []
        lo = 0
        for idx in range(self.num_groups):
            hi = lo + self.split_channels[idx]
            outputs.append(self.mixed_depthwise_conv[idx](x[:, lo:hi]))
            lo = hi

        return ops.concat(outputs, axis=1)

mindcv.models.mixnet.MixNet

Bases: Cell

MixNet model class, based on "MixConv: Mixed Depthwise Convolutional Kernels" <https://arxiv.org/abs/1907.09595>_

PARAMETER DESCRIPTION
arch

size of the architecture. "small", "medium" or "large". Default: "small".

TYPE: str DEFAULT: 'small'

num_classes

number of classification classes. Default: 1000.

TYPE: int DEFAULT: 1000

in_channels

number of the channels of the input. Default: 3.

TYPE: int DEFAULT: 3

feature_size

number of the channels of the output features. Default: 1536.

TYPE: int DEFAULT: 1536

drop_rate

rate of dropout for classifier. Default: 0.2.

TYPE: float DEFAULT: 0.2

depth_multiplier

expansion coefficient of channels. Default: 1.0.

TYPE: float DEFAULT: 1.0

Source code in mindcv\models\mixnet.py
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
class MixNet(nn.Cell):
    r"""MixNet model class, based on
    `"MixConv: Mixed Depthwise Convolutional Kernels" <https://arxiv.org/abs/1907.09595>`_

    Args:
        arch: size of the architecture. "small", "medium" or "large". Default: "small".
        num_classes: number of classification classes. Default: 1000.
        in_channels: number of the channels of the input. Default: 3.
        feature_size: number of the channels of the output features. Default: 1536.
        drop_rate: rate of dropout for classifier. Default: 0.2.
        depth_multiplier: expansion coefficient of channels. Default: 1.0.

    Raises:
        ValueError: if `arch` is not one of "small", "medium" or "large".
    """

    def __init__(
        self,
        arch: str = "small",
        num_classes: int = 1000,
        in_channels: int = 3,
        feature_size: int = 1536,
        drop_rate: float = 0.2,
        depth_multiplier: float = 1.0
    ) -> None:
        super(MixNet, self).__init__()
        # Each config row:
        # [in_ch, out_ch, dw_kernel_sizes, expand_ksizes, project_ksizes,
        #  stride, expand_ratio, activation, se_ratio]
        if arch == "small":
            block_configs = [
                [16, 16, [3], [1], [1], 1, 1, "ReLU", 0.0],
                [16, 24, [3], [1, 1], [1, 1], 2, 6, "ReLU", 0.0],
                [24, 24, [3], [1, 1], [1, 1], 1, 3, "ReLU", 0.0],
                [24, 40, [3, 5, 7], [1], [1], 2, 6, "Swish", 0.5],
                [40, 40, [3, 5], [1, 1], [1, 1], 1, 6, "Swish", 0.5],
                [40, 40, [3, 5], [1, 1], [1, 1], 1, 6, "Swish", 0.5],
                [40, 40, [3, 5], [1, 1], [1, 1], 1, 6, "Swish", 0.5],
                [40, 80, [3, 5, 7], [1], [1, 1], 2, 6, "Swish", 0.25],
                [80, 80, [3, 5], [1], [1, 1], 1, 6, "Swish", 0.25],
                [80, 80, [3, 5], [1], [1, 1], 1, 6, "Swish", 0.25],
                [80, 120, [3, 5, 7], [1, 1], [1, 1], 1, 6, "Swish", 0.5],
                [120, 120, [3, 5, 7, 9], [1, 1], [1, 1], 1, 3, "Swish", 0.5],
                [120, 120, [3, 5, 7, 9], [1, 1], [1, 1], 1, 3, "Swish", 0.5],
                [120, 200, [3, 5, 7, 9, 11], [1], [1], 2, 6, "Swish", 0.5],
                [200, 200, [3, 5, 7, 9], [1], [1, 1], 1, 6, "Swish", 0.5],
                [200, 200, [3, 5, 7, 9], [1], [1, 1], 1, 6, "Swish", 0.5]
            ]
            stem_channels = 16
        elif arch in ("medium", "large"):
            # "medium" and "large" share the same layout; "large" widens it
            # through a bigger depth multiplier.
            block_configs = [
                [24, 24, [3], [1], [1], 1, 1, "ReLU", 0.0],
                [24, 32, [3, 5, 7], [1, 1], [1, 1], 2, 6, "ReLU", 0.0],
                [32, 32, [3], [1, 1], [1, 1], 1, 3, "ReLU", 0.0],
                [32, 40, [3, 5, 7, 9], [1], [1], 2, 6, "Swish", 0.5],
                [40, 40, [3, 5], [1, 1], [1, 1], 1, 6, "Swish", 0.5],
                [40, 40, [3, 5], [1, 1], [1, 1], 1, 6, "Swish", 0.5],
                [40, 40, [3, 5], [1, 1], [1, 1], 1, 6, "Swish", 0.5],
                [40, 80, [3, 5, 7], [1], [1], 2, 6, "Swish", 0.25],
                [80, 80, [3, 5, 7, 9], [1, 1], [1, 1], 1, 6, "Swish", 0.25],
                [80, 80, [3, 5, 7, 9], [1, 1], [1, 1], 1, 6, "Swish", 0.25],
                [80, 80, [3, 5, 7, 9], [1, 1], [1, 1], 1, 6, "Swish", 0.25],
                [80, 120, [3], [1], [1], 1, 6, "Swish", 0.5],
                [120, 120, [3, 5, 7, 9], [1, 1], [1, 1], 1, 3, "Swish", 0.5],
                [120, 120, [3, 5, 7, 9], [1, 1], [1, 1], 1, 3, "Swish", 0.5],
                [120, 120, [3, 5, 7, 9], [1, 1], [1, 1], 1, 3, "Swish", 0.5],
                [120, 200, [3, 5, 7, 9], [1], [1], 2, 6, "Swish", 0.5],
                [200, 200, [3, 5, 7, 9], [1], [1, 1], 1, 6, "Swish", 0.5],
                [200, 200, [3, 5, 7, 9], [1], [1, 1], 1, 6, "Swish", 0.5],
                [200, 200, [3, 5, 7, 9], [1], [1, 1], 1, 6, "Swish", 0.5]
            ]
            stem_channels = 24
            if arch == "large":
                depth_multiplier *= 1.3
        else:
            raise ValueError(f"Unsupported model type {arch}")

        if depth_multiplier != 1.0:
            # Scale stem and per-block channel counts, rounding to valid widths.
            stem_channels = _roundchannels(stem_channels * depth_multiplier)

            for i, conf in enumerate(block_configs):
                conf_ls = list(conf)
                conf_ls[0] = _roundchannels(conf_ls[0] * depth_multiplier)
                conf_ls[1] = _roundchannels(conf_ls[1] * depth_multiplier)
                block_configs[i] = tuple(conf_ls)

        # stem convolution: 3x3 stride-2 conv -> BN -> ReLU
        self.stem_conv = nn.SequentialCell([
            nn.Conv2d(in_channels, stem_channels, 3, stride=2, pad_mode="pad", padding=1),
            nn.BatchNorm2d(stem_channels),
            nn.ReLU()
        ])

        # building MixNet blocks from the config table
        layers = []
        for inc, outc, k, ek, pk, s, er, ac, se in block_configs:
            layers.append(MixNetBlock(
                inc,
                outc,
                kernel_size=k,
                expand_ksize=ek,
                project_ksize=pk,
                stride=s,
                expand_ratio=er,
                activation=ac,
                se_ratio=se
            ))
        self.layers = nn.SequentialCell(layers)

        # head: 1x1 conv expands to the final feature width
        self.head_conv = nn.SequentialCell([
            nn.Conv2d(block_configs[-1][1], feature_size, 1, pad_mode="pad", padding=0),
            nn.BatchNorm2d(feature_size),
            nn.ReLU()
        ])

        self.pool = GlobalAvgPooling()
        self.dropout = Dropout(p=drop_rate)
        self.classifier = nn.Dense(feature_size, num_classes)

        self._initialize_weights()

    def _initialize_weights(self) -> None:
        """Initialize weights for cells.

        Conv2d: normal with std sqrt(2 / fan_out); BatchNorm2d: gamma=1, beta=0;
        Dense: uniform in [-1/sqrt(fan_in), 1/sqrt(fan_in)]; biases are zeroed.
        """
        for _, cell in self.cells_and_names():
            if isinstance(cell, nn.Conv2d):
                fan_out = cell.kernel_size[0] * cell.kernel_size[1] * cell.out_channels
                cell.weight.set_data(
                    init.initializer(init.Normal(math.sqrt(2.0 / fan_out)),
                                     cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(
                        init.initializer("zeros", cell.bias.shape, cell.bias.dtype))
            elif isinstance(cell, nn.BatchNorm2d):
                cell.gamma.set_data(init.initializer("ones", cell.gamma.shape, cell.gamma.dtype))
                cell.beta.set_data(init.initializer("zeros", cell.beta.shape, cell.beta.dtype))
            elif isinstance(cell, nn.Dense):
                cell.weight.set_data(
                    init.initializer(init.Uniform(1.0 / math.sqrt(cell.weight.shape[0])),
                                     cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(init.initializer("zeros", cell.bias.shape, cell.bias.dtype))

    def forward_features(self, x: Tensor) -> Tensor:
        """Extract convolutional feature maps (stem -> blocks -> head)."""
        x = self.stem_conv(x)
        x = self.layers(x)
        x = self.head_conv(x)
        return x

    def forward_head(self, x: Tensor) -> Tensor:
        """Classify pooled features: global-avg-pool -> dropout -> dense."""
        x = self.pool(x)
        x = self.dropout(x)
        x = self.classifier(x)
        return x

    def construct(self, x: Tensor) -> Tensor:
        x = self.forward_features(x)
        x = self.forward_head(x)
        return x

mindcv.models.mixnet.MixNetBlock

Bases: Cell

Basic Block of MixNet

Source code in mindcv\models\mixnet.py
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
class MixNetBlock(nn.Cell):
    """Basic Block of MixNet.

    Optional grouped 1x1 expansion, mixed depth-wise convolution, optional
    squeeze-and-excite, and a grouped 1x1 projection; a residual connection
    is added when the input and output shapes match.

    Args:
        in_channels: number of input channels.
        out_channels: number of output channels.
        kernel_size: kernel sizes of the mixed depth-wise conv. Default: (3,).
        expand_ksize: kernel sizes of the grouped expansion conv. Default: (1,).
        project_ksize: kernel sizes of the grouped projection conv. Default: (1,).
        stride: stride of the depth-wise convolution. Default: 1.
        expand_ratio: channel expansion ratio; 1 skips the expansion phase. Default: 1.
        activation: activation name, "ReLU" or "Swish". Default: "ReLU".
        se_ratio: squeeze-and-excite ratio; <= 0 disables the SE module. Default: 0.0.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        # Tuple defaults replace the original mutable list defaults, which are
        # shared across calls; consumers are assumed to only len()/index them
        # (true for MDConv here — confirm for GroupedConv2d).
        kernel_size: tuple = (3,),
        expand_ksize: tuple = (1,),
        project_ksize: tuple = (1,),
        stride: int = 1,
        expand_ratio: int = 1,
        activation: str = "ReLU",
        se_ratio: float = 0.0,
    ) -> None:
        super(MixNetBlock, self).__init__()
        assert activation in ["ReLU", "Swish"]
        self.activation = Swish if activation == "Swish" else nn.ReLU

        expand_channels = in_channels * expand_ratio
        # Residual shortcut is only valid when the tensor shape is preserved.
        self.residual_connection = (stride == 1 and in_channels == out_channels)

        conv = []
        if expand_ratio != 1:
            # expansion phase: grouped 1x1 conv widens the channels
            conv.extend([
                GroupedConv2d(in_channels, expand_channels, expand_ksize),
                nn.BatchNorm2d(expand_channels),
                self.activation()
            ])

        # depthwise phase: mixed-kernel depth-wise convolution
        conv.extend([
            MDConv(expand_channels, kernel_size, stride),
            nn.BatchNorm2d(expand_channels),
            self.activation()
        ])

        if se_ratio > 0:
            # squeeze ratio is taken relative to the *input* channels
            squeeze_channels = int(in_channels * se_ratio)
            squeeze_excite = SqueezeExcite(expand_channels, rd_channels=squeeze_channels)
            conv.append(squeeze_excite)

        # projection phase: grouped 1x1 conv back to out_channels (no activation)
        conv.extend([
            GroupedConv2d(expand_channels, out_channels, project_ksize),
            nn.BatchNorm2d(out_channels)
        ])

        self.convs = nn.SequentialCell(conv)

    def construct(self, x: Tensor) -> Tensor:
        if self.residual_connection:
            return x + self.convs(x)
        else:
            return self.convs(x)

mlpmixer

mindcv.models.mlpmixer

MindSpore implementation of MLP-Mixer. Refer to MLP-Mixer: An all-MLP Architecture for Vision.

mindcv.models.mlpmixer.FeedForward

Bases: Cell

Feed Forward Block. MLP Layer. FC -> GELU -> FC

Source code in mindcv\models\mlpmixer.py
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
class FeedForward(nn.Cell):
    """MLP layer used by the Mixer blocks: Dense -> GELU -> Dropout -> Dense -> Dropout."""

    def __init__(self, dim, hidden_dim, dropout=0.):
        super(FeedForward, self).__init__()
        stages = (
            nn.Dense(dim, hidden_dim),
            nn.GELU(),
            Dropout(p=dropout),
            nn.Dense(hidden_dim, dim),
            Dropout(p=dropout),
        )
        self.net = nn.SequentialCell(*stages)

    def construct(self, x):
        return self.net(x)

mindcv.models.mlpmixer.MLPMixer

Bases: Cell

MLP-Mixer model class, based on "MLP-Mixer: An all-MLP Architecture for Vision" <https://arxiv.org/abs/2105.01601>_

PARAMETER DESCRIPTION
depth

number of MixerBlocks.

TYPE: int)

patch_size

size of a single image patch.

TYPE: int or tuple)

n_patches

number of patches.

TYPE: int)

n_channels

channels(dimension) of a single embedded patch.

TYPE: int)

token_dim

hidden dim of token-mixing MLP.

TYPE: int)

channel_dim

hidden dim of channel-mixing MLP.

TYPE: int)

num_classes

number of classification classes.

TYPE: int) DEFAULT: 1000

in_channels

number of the channels of the input. Default: 3.

DEFAULT: 3

Source code in mindcv\models\mlpmixer.py
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
class MLPMixer(nn.Cell):
    r"""MLP-Mixer model class, based on
    `"MLP-Mixer: An all-MLP Architecture for Vision" <https://arxiv.org/abs/2105.01601>`_

    Args:
        depth (int) : number of MixerBlocks.
        patch_size (int or tuple) : size of a single image patch.
        n_patches (int) : number of patches.
        n_channels (int) : channels(dimension) of a single embedded patch.
        token_dim (int) : hidden dim of token-mixing MLP.
        channel_dim (int) : hidden dim of channel-mixing MLP.
        num_classes (int) : number of classification classes. Default: 1000.
        in_channels: number of the channels of the input. Default: 3.
    """

    def __init__(self, depth, patch_size, n_patches, n_channels, token_dim, channel_dim, num_classes=1000,
                 in_channels=3):
        super().__init__()
        self.n_patches = n_patches
        self.n_channels = n_channels
        # A strided convolution embeds each (in_channels, patch_size, patch_size)
        # patch into an n_channels-dim vector; TransPose then flattens the
        # spatial grid into a (batch, n_patches, n_channels) sequence.
        self.to_patch_embedding = nn.SequentialCell(
            nn.Conv2d(in_channels, n_channels, patch_size, patch_size, pad_mode="pad", padding=0),
            TransPose(permutation=(0, 2, 1), embedding=True),
        )
        blocks = [MixerBlock(n_patches, n_channels, token_dim, channel_dim) for _ in range(depth)]
        self.mixer_blocks = nn.SequentialCell(blocks)
        self.layer_norm = nn.LayerNorm((n_channels,))
        self.mlp_head = nn.Dense(n_channels, num_classes)
        self.mean = ops.ReduceMean()
        self._initialize_weights()

    def construct(self, x):
        x = self.to_patch_embedding(x)
        x = self.mixer_blocks(x)
        x = self.layer_norm(x)
        # Average over the patch axis before the classification head.
        x = self.mean(x, 1)
        return self.mlp_head(x)

    def _initialize_weights(self):
        # todo: implement weights init
        pass

mindcv.models.mlpmixer.MixerBlock

Bases: Cell

Mixer Layer with token-mixing MLP and channel-mixing MLP

Source code in mindcv\models\mlpmixer.py
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
class MixerBlock(nn.Cell):
    """One Mixer layer: a token-mixing MLP (across patches) followed by a
    channel-mixing MLP (across channels), each wrapped in a residual."""

    def __init__(self, n_patches, n_channels, token_dim, channel_dim, dropout=0.):
        super().__init__()
        # Token mixing operates along the patch axis, so the sequence is
        # transposed before the MLP and transposed back afterwards.
        self.token_mix = nn.SequentialCell(
            nn.LayerNorm((n_channels,)),
            TransPose((0, 2, 1)),
            FeedForward(n_patches, token_dim, dropout),
            TransPose((0, 2, 1)),
        )
        self.channel_mix = nn.SequentialCell(
            nn.LayerNorm((n_channels,)),
            FeedForward(n_channels, channel_dim, dropout),
        )

    def construct(self, x):
        y = x + self.token_mix(x)
        return y + self.channel_mix(y)

mindcv.models.mlpmixer.TransPose

Bases: Cell

TransPose Layer. Wrap operator Transpose for easy integration in nn.SequentialCell

Source code in mindcv\models\mlpmixer.py
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
class TransPose(nn.Cell):
    """Wraps ops.Transpose so it can sit inside an nn.SequentialCell.

    With ``embedding=True`` the (b, c, h, w) input is first flattened to
    (b, c, h*w) before the permutation is applied.
    """

    def __init__(self, permutation=(0, 2, 1), embedding=False):
        super(TransPose, self).__init__()
        self.permutation = permutation
        self.embedding = embedding
        self.transpose = ops.Transpose()
        if embedding:
            self.reshape = ops.Reshape()

    def construct(self, x):
        if self.embedding:
            b, c, h, w = x.shape
            x = self.reshape(x, (b, c, h * w))
        return self.transpose(x, self.permutation)

mnasnet

mindcv.models.mnasnet

MindSpore implementation of MnasNet. Refer to MnasNet: Platform-Aware Neural Architecture Search for Mobile.

mindcv.models.mnasnet.Mnasnet

Bases: Cell

MnasNet model architecture from "MnasNet: Platform-Aware Neural Architecture Search for Mobile" <https://arxiv.org/abs/1807.11626>_.

PARAMETER DESCRIPTION
alpha

scale factor of model width.

TYPE: float

in_channels

number of the channels of the input. Default: 3.

TYPE: int DEFAULT: 3

num_classes

number of classification classes. Default: 1000.

TYPE: int DEFAULT: 1000

drop_rate

dropout rate of the layer before main classifier. Default: 0.2.

TYPE: float DEFAULT: 0.2

Source code in mindcv\models\mnasnet.py
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
class Mnasnet(nn.Cell):
    r"""MnasNet model architecture from
    `"MnasNet: Platform-Aware Neural Architecture Search for Mobile" <https://arxiv.org/abs/1807.11626>`_.

    Args:
        alpha: scale factor of model width.
        in_channels: number of the channels of the input. Default: 3.
        num_classes: number of classification classes. Default: 1000.
        drop_rate: dropout rate of the layer before main classifier. Default: 0.2.
    """

    def __init__(
        self,
        alpha: float,
        in_channels: int = 3,
        num_classes: int = 1000,
        drop_rate: float = 0.2,
    ):
        super().__init__()

        # Inverted residual stages; comments give the resulting spatial size
        # for a 224x224 input.
        inverted_residual_setting = [
            # t, c, n, s, k
            [3, 24, 3, 2, 3],  # -> 56x56
            [3, 40, 3, 2, 5],  # -> 28x28
            [6, 80, 3, 2, 5],  # -> 14x14
            [6, 96, 2, 1, 3],  # -> 14x14
            [6, 192, 4, 2, 5],  # -> 7x7
            [6, 320, 1, 1, 3],  # -> 7x7
        ]

        # Width-scaled channel counts, rounded to multiples of 8.
        mid_channels = make_divisible(32 * alpha, 8)
        input_channels = make_divisible(16 * alpha, 8)

        # Stem: regular conv -> depth-wise conv -> 1x1 projection (no activation
        # after the projection BN).
        features: List[nn.Cell] = [
            nn.Conv2d(in_channels, mid_channels, kernel_size=3, stride=2, pad_mode="pad", padding=1),
            nn.BatchNorm2d(mid_channels, momentum=0.99, eps=1e-3),
            nn.ReLU(),
            nn.Conv2d(mid_channels, mid_channels, kernel_size=3, stride=1, pad_mode="pad", padding=1,
                      group=mid_channels),
            nn.BatchNorm2d(mid_channels, momentum=0.99, eps=1e-3),
            nn.ReLU(),
            nn.Conv2d(mid_channels, input_channels, kernel_size=1, stride=1),
            nn.BatchNorm2d(input_channels, momentum=0.99, eps=1e-3),
        ]

        # Expand the stage table: only the first block of a stage downsamples.
        for t, c, n, s, k in inverted_residual_setting:
            output_channels = make_divisible(c * alpha, 8)
            for i in range(n):
                stride = s if i == 0 else 1
                features.append(InvertedResidual(input_channels, output_channels,
                                                 stride=stride, kernel_size=k, expand_ratio=t))
                input_channels = output_channels

        # Final 1x1 conv lifts the features to the fixed classifier width.
        features.extend([
            nn.Conv2d(input_channels, 1280, kernel_size=1, stride=1),
            nn.BatchNorm2d(1280, momentum=0.99, eps=1e-3),
            nn.ReLU(),
        ])
        self.features = nn.SequentialCell(features)
        self.pool = GlobalAvgPooling()
        self.dropout = Dropout(p=drop_rate)
        self.classifier = nn.Dense(1280, num_classes)
        self._initialize_weights()

    def _initialize_weights(self) -> None:
        """Initialize weights for cells.

        Conv2d: He-normal (fan_out, relu); BatchNorm2d: gamma=1, beta=0;
        Dense: He-uniform (fan_out, sigmoid); all biases are zeroed.
        """
        for _, cell in self.cells_and_names():
            if isinstance(cell, nn.Conv2d):
                cell.weight.set_data(
                    init.initializer(init.HeNormal(mode="fan_out", nonlinearity="relu"),
                                     cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(init.initializer("zeros", cell.bias.shape, cell.bias.dtype))
            elif isinstance(cell, nn.BatchNorm2d):
                cell.gamma.set_data(init.initializer("ones", cell.gamma.shape, cell.gamma.dtype))
                cell.beta.set_data(init.initializer("zeros", cell.beta.shape, cell.beta.dtype))
            elif isinstance(cell, nn.Dense):
                cell.weight.set_data(
                    init.initializer(init.HeUniform(mode="fan_out", nonlinearity="sigmoid"),
                                     cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(init.initializer("zeros", cell.bias.shape, cell.bias.dtype))

    def forward_features(self, x: Tensor) -> Tensor:
        """Extract convolutional feature maps."""
        x = self.features(x)
        return x

    def forward_head(self, x: Tensor) -> Tensor:
        """Classify pooled features: global-avg-pool -> dropout -> dense."""
        x = self.pool(x)
        x = self.dropout(x)
        x = self.classifier(x)
        return x

    def construct(self, x: Tensor) -> Tensor:
        x = self.forward_features(x)
        x = self.forward_head(x)
        return x

mindcv.models.mnasnet.mnasnet_050(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get MnasNet model with width scaled by 0.5. Refer to the base class models.Mnasnet for more details.

Source code in mindcv\models\mnasnet.py
180
181
182
183
184
185
186
187
188
189
190
@register_model
def mnasnet_050(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> Mnasnet:
    """Get MnasNet model with width scaled by 0.5.
    Refer to the base class `models.Mnasnet` for more details."""
    cfg = default_cfgs["mnasnet_050"]
    model = Mnasnet(alpha=0.5, in_channels=in_channels, num_classes=num_classes, **kwargs)
    if pretrained:
        load_pretrained(model, cfg, num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.mnasnet.mnasnet_075(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get MnasNet model with width scaled by 0.75. Refer to the base class models.Mnasnet for more details.

Source code in mindcv\models\mnasnet.py
193
194
195
196
197
198
199
200
201
202
203
@register_model
def mnasnet_075(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> Mnasnet:
    """Get MnasNet model with width scaled by 0.75.
    Refer to the base class `models.Mnasnet` for more details."""
    cfg = default_cfgs["mnasnet_075"]
    model = Mnasnet(alpha=0.75, in_channels=in_channels, num_classes=num_classes, **kwargs)
    if pretrained:
        load_pretrained(model, cfg, num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.mnasnet.mnasnet_100(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get MnasNet model with width scaled by 1.0. Refer to the base class models.Mnasnet for more details.

Source code in mindcv\models\mnasnet.py
206
207
208
209
210
211
212
213
214
215
216
@register_model
def mnasnet_100(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> Mnasnet:
    """Get MnasNet model with width scaled by 1.0.
    Refer to the base class `models.Mnasnet` for more details."""
    cfg = default_cfgs["mnasnet_100"]
    model = Mnasnet(alpha=1.0, in_channels=in_channels, num_classes=num_classes, **kwargs)
    if pretrained:
        load_pretrained(model, cfg, num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.mnasnet.mnasnet_130(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get MnasNet model with width scaled by 1.3. Refer to the base class models.Mnasnet for more details.

Source code in mindcv\models\mnasnet.py
219
220
221
222
223
224
225
226
227
228
229
@register_model
def mnasnet_130(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> Mnasnet:
    """Get MnasNet model with width scaled by 1.3.
    Refer to the base class `models.Mnasnet` for more details."""
    cfg = default_cfgs["mnasnet_130"]
    model = Mnasnet(alpha=1.3, in_channels=in_channels, num_classes=num_classes, **kwargs)
    if pretrained:
        load_pretrained(model, cfg, num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.mnasnet.mnasnet_140(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get MnasNet model with width scaled by 1.4. Refer to the base class models.Mnasnet for more details.

Source code in mindcv\models\mnasnet.py
232
233
234
235
236
237
238
239
240
241
242
@register_model
def mnasnet_140(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> Mnasnet:
    """Get MnasNet model with width scaled by 1.4.
    Refer to the base class `models.Mnasnet` for more details."""
    cfg = default_cfgs["mnasnet_140"]
    model = Mnasnet(alpha=1.4, in_channels=in_channels, num_classes=num_classes, **kwargs)
    if pretrained:
        load_pretrained(model, cfg, num_classes=num_classes, in_channels=in_channels)
    return model

mobilenetv1

mindcv.models.mobilenetv1

MindSpore implementation of MobileNetV1. Refer to MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications.

mindcv.models.mobilenetv1.MobileNetV1

Bases: Cell

MobileNetV1 model class, based on "MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" <https://arxiv.org/abs/1704.04861>_ # noqa: E501

PARAMETER DESCRIPTION
alpha

scale factor of model width. Default: 1.

TYPE: float DEFAULT: 1.0

in_channels

number of the channels of the input. Default: 3.

TYPE: int DEFAULT: 3

num_classes

number of classification classes. Default: 1000.

TYPE: int DEFAULT: 1000

Source code in mindcv\models\mobilenetv1.py
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
class MobileNetV1(nn.Cell):
    r"""MobileNetV1 model class, based on
    `"MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" <https://arxiv.org/abs/1704.04861>`_  # noqa: E501

    Args:
        alpha: scale factor of model width. Default: 1.
        in_channels: number of the channels of the input. Default: 3.
        num_classes: number of classification classes. Default: 1000.
    """

    def __init__(
        self,
        alpha: float = 1.0,
        in_channels: int = 3,
        num_classes: int = 1000,
    ) -> None:
        super().__init__()
        stem_channels = int(32 * alpha)
        # Per-stage setting of the depth-wise separable convolutions:
        # c: number of output channels, s: stride of the depth-wise conv.
        block_setting = [
            # c, s
            [64, 1],
            [128, 2],
            [128, 1],
            [256, 2],
            [256, 1],
            [512, 2],
            [512, 1],
            [512, 1],
            [512, 1],
            [512, 1],
            [512, 1],
            [1024, 2],
            [1024, 1],
        ]

        # Stem: 3x3 stride-2 conv -> BN -> ReLU, then the separable-conv stack.
        layers = [
            nn.Conv2d(in_channels, stem_channels, 3, 2, pad_mode="pad", padding=1, has_bias=False),
            nn.BatchNorm2d(stem_channels),
            nn.ReLU(),
        ]
        prev_channels = stem_channels
        for c, s in block_setting:
            cur_channels = int(c * alpha)
            layers.append(depthwise_separable_conv(prev_channels, cur_channels, s))
            prev_channels = cur_channels
        self.features = nn.SequentialCell(layers)

        self.pool = GlobalAvgPooling()
        self.classifier = nn.Dense(prev_channels, num_classes)
        self._initialize_weights()

    def _initialize_weights(self) -> None:
        """Initialize weights: Xavier-uniform for convs, truncated normal for dense layers."""
        for _, cell in self.cells_and_names():
            if isinstance(cell, nn.Conv2d):
                cell.weight.set_data(init.initializer(init.XavierUniform(), cell.weight.shape, cell.weight.dtype))
            if isinstance(cell, nn.Dense):
                cell.weight.set_data(init.initializer(init.TruncatedNormal(), cell.weight.shape, cell.weight.dtype))

    def forward_features(self, x: Tensor) -> Tensor:
        """Extract convolutional feature maps."""
        return self.features(x)

    def forward_head(self, x: Tensor) -> Tensor:
        """Classify pooled features: global-avg-pool -> dense."""
        return self.classifier(self.pool(x))

    def construct(self, x: Tensor) -> Tensor:
        return self.forward_head(self.forward_features(x))

mindcv.models.mobilenetv1.mobilenet_v1_025(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get MobileNetV1 model with width scaled by 0.25. Refer to the base class models.MobileNetV1 for more details.

Source code in mindcv\models\mobilenetv1.py
137
138
139
140
141
142
143
144
145
146
147
148
@register_model
def mobilenet_v1_025(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> MobileNetV1:
    """Get MobileNetV1 model with width scaled by 0.25.
    Refer to the base class `models.MobileNetV1` for more details.
    """
    cfg = default_cfgs["mobilenet_v1_025"]
    model = MobileNetV1(alpha=0.25, in_channels=in_channels, num_classes=num_classes, **kwargs)
    if pretrained:
        load_pretrained(model, cfg, num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.mobilenetv1.mobilenet_v1_050(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get MobileNetV1 model with width scaled by 0.5. Refer to the base class models.MobileNetV1 for more details.

Source code in mindcv\models\mobilenetv1.py
151
152
153
154
155
156
157
158
159
160
161
162
@register_model
def mobilenet_v1_050(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> MobileNetV1:
    """Get MobileNetV1 model with width scaled by 0.5.
    Refer to the base class `models.MobileNetV1` for more details.
    """
    cfg = default_cfgs["mobilenet_v1_050"]
    model = MobileNetV1(alpha=0.5, in_channels=in_channels, num_classes=num_classes, **kwargs)
    if pretrained:
        load_pretrained(model, cfg, num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.mobilenetv1.mobilenet_v1_075(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get MobileNetV1 model with width scaled by 0.75. Refer to the base class models.MobileNetV1 for more details.

Source code in mindcv\models\mobilenetv1.py
165
166
167
168
169
170
171
172
173
174
175
176
@register_model
def mobilenet_v1_075(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> MobileNetV1:
    """Build a MobileNetV1 with width multiplier 0.75.
    See `models.MobileNetV1` for the full architecture description.
    """
    cfg = default_cfgs["mobilenet_v1_075"]
    net = MobileNetV1(alpha=0.75, in_channels=in_channels, num_classes=num_classes, **kwargs)
    if pretrained:
        load_pretrained(net, cfg, num_classes=num_classes, in_channels=in_channels)
    return net

mindcv.models.mobilenetv1.mobilenet_v1_100(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get MobileNetV1 model without width scaling. Refer to the base class models.MobileNetV1 for more details.

Source code in mindcv\models\mobilenetv1.py
179
180
181
182
183
184
185
186
187
188
189
190
@register_model
def mobilenet_v1_100(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> MobileNetV1:
    """Build a MobileNetV1 without width scaling (alpha=1.0).
    See `models.MobileNetV1` for the full architecture description.
    """
    cfg = default_cfgs["mobilenet_v1_100"]
    net = MobileNetV1(alpha=1.0, in_channels=in_channels, num_classes=num_classes, **kwargs)
    if pretrained:
        load_pretrained(net, cfg, num_classes=num_classes, in_channels=in_channels)
    return net

mobilenetv2

mindcv.models.mobilenetv2

MindSpore implementation of MobileNetV2. Refer to MobileNetV2: Inverted Residuals and Linear Bottlenecks.

mindcv.models.mobilenetv2.InvertedResidual

Bases: Cell

Inverted Residual Block of MobileNetV2

Source code in mindcv\models\mobilenetv2.py
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
class InvertedResidual(nn.Cell):
    """MobileNetV2 inverted residual block: pointwise expansion, depthwise
    convolution, then a linear pointwise projection, with an identity shortcut
    when stride is 1 and the channel count is unchanged."""

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        stride: int,
        expand_ratio: int,
    ) -> None:
        super().__init__()
        assert stride in [1, 2]
        hidden_dim = round(in_channels * expand_ratio)
        self.use_res_connect = stride == 1 and in_channels == out_channels

        blocks = []
        if expand_ratio != 1:
            # Pointwise expansion (omitted when the expansion factor is 1).
            blocks.append(nn.Conv2d(in_channels, hidden_dim, 1, 1, pad_mode="pad", padding=0, has_bias=False))
            blocks.append(nn.BatchNorm2d(hidden_dim))
            blocks.append(nn.ReLU6())
        # Depthwise 3x3 convolution.
        blocks.append(
            nn.Conv2d(hidden_dim, hidden_dim, 3, stride, pad_mode="pad", padding=1, group=hidden_dim, has_bias=False))
        blocks.append(nn.BatchNorm2d(hidden_dim))
        blocks.append(nn.ReLU6())
        # Linear pointwise projection (no activation).
        blocks.append(nn.Conv2d(hidden_dim, out_channels, 1, 1, pad_mode="pad", padding=0, has_bias=False))
        blocks.append(nn.BatchNorm2d(out_channels))
        self.layers = nn.SequentialCell(blocks)

    def construct(self, x: Tensor) -> Tensor:
        out = self.layers(x)
        if self.use_res_connect:
            out = x + out
        return out

mindcv.models.mobilenetv2.MobileNetV2

Bases: Cell

MobileNetV2 model class, based on "MobileNetV2: Inverted Residuals and Linear Bottlenecks" <https://arxiv.org/abs/1801.04381>_

PARAMETER DESCRIPTION
alpha

scale factor of model width. Default: 1.

TYPE: float DEFAULT: 1.0

round_nearest

divisor of make divisible function. Default: 8.

TYPE: int DEFAULT: 8

in_channels

Number of input channels. Default: 3.

TYPE: int DEFAULT: 3

num_classes

number of classification classes. Default: 1000.

TYPE: int DEFAULT: 1000

Source code in mindcv\models\mobilenetv2.py
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
class MobileNetV2(nn.Cell):
    r"""MobileNetV2 model class, based on
    `"MobileNetV2: Inverted Residuals and Linear Bottlenecks" <https://arxiv.org/abs/1801.04381>`_

    Args:
        alpha: scale factor of model width. Default: 1.
        round_nearest: divisor of make divisible function. Default: 8.
        in_channels: number of channels of the input. Default: 3.
        num_classes: number of classification classes. Default: 1000.
    """

    def __init__(
        self,
        alpha: float = 1.0,
        round_nearest: int = 8,
        in_channels: int = 3,
        num_classes: int = 1000,
    ) -> None:
        super().__init__()
        # Stem width is scaled by alpha and rounded to a multiple of round_nearest.
        input_channels = make_divisible(32 * alpha, round_nearest)
        # Setting of inverted residual blocks.
        # t: The expansion factor.
        # c: Number of output channel.
        # n: Number of block.
        # s: First block stride.
        inverted_residual_setting = [
            # t, c, n, s
            [1, 16, 1, 1],
            [6, 24, 2, 2],
            [6, 32, 3, 2],
            [6, 64, 4, 2],
            [6, 96, 3, 1],
            [6, 160, 3, 2],
            [6, 320, 1, 1],
        ]
        # max(1.0, alpha) keeps the final pointwise layer at 1280 channels when alpha < 1.
        last_channels = make_divisible(1280 * max(1.0, alpha), round_nearest)

        # Building stem conv layer.
        features = [
            nn.Conv2d(in_channels, input_channels, 3, 2, pad_mode="pad", padding=1, has_bias=False),
            nn.BatchNorm2d(input_channels),
            nn.ReLU6(),
        ]

        # feature_info records (channels, cumulative stride, layer name) after the stem
        # and after every block — presumably consumed by the "features_only" feature
        # extraction machinery; TODO confirm against the feature-wrapper code.
        total_reduction = 2  # the stem conv has stride 2
        self.feature_info = []
        self.flatten_sequential = True
        self.feature_info.append(dict(chs=input_channels, reduction=total_reduction,
                                      name=f'features.{len(features) - 1}'))

        # Building inverted residual blocks.
        for t, c, n, s in inverted_residual_setting:
            output_channel = make_divisible(c * alpha, round_nearest)
            for i in range(n):
                # Only the first block of a stage applies the stage stride.
                stride = s if i == 0 else 1
                features.append(InvertedResidual(input_channels, output_channel, stride, expand_ratio=t))
                input_channels = output_channel

                total_reduction *= stride
                self.feature_info.append(dict(chs=output_channel, reduction=total_reduction,
                                              name=f'features.{len(features) - 1}'))

        # Building last point-wise layers.
        features.extend([
            nn.Conv2d(input_channels, last_channels, 1, 1, pad_mode="pad", padding=0, has_bias=False),
            nn.BatchNorm2d(last_channels),
            nn.ReLU6(),
        ])

        self.feature_info.append(dict(chs=last_channels, reduction=total_reduction,
                                      name=f'features.{len(features) - 1}'))

        self.features = nn.SequentialCell(features)

        self.pool = GlobalAvgPooling()
        self.classifier = nn.SequentialCell([
            Dropout(p=0.2),  # confirmed by paper authors
            nn.Dense(last_channels, num_classes),
        ])
        self._initialize_weights()

    def _initialize_weights(self) -> None:
        """Initialize weights for cells."""
        for _, cell in self.cells_and_names():
            if isinstance(cell, nn.Conv2d):
                # Kaiming-style normal init: sigma scales with fan-out of the conv kernel.
                n = cell.kernel_size[0] * cell.kernel_size[1] * cell.out_channels
                cell.weight.set_data(
                    init.initializer(init.Normal(sigma=math.sqrt(2. / n), mean=0.0),
                                     cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(init.initializer("zeros", cell.bias.shape, cell.bias.dtype))
            elif isinstance(cell, nn.BatchNorm2d):
                cell.gamma.set_data(init.initializer("ones", cell.gamma.shape, cell.gamma.dtype))
                cell.beta.set_data(init.initializer("zeros", cell.beta.shape, cell.beta.dtype))
            elif isinstance(cell, nn.Dense):
                cell.weight.set_data(
                    init.initializer(init.Normal(sigma=0.01, mean=0.0), cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(init.initializer("zeros", cell.bias.shape, cell.bias.dtype))

    def forward_features(self, x: Tensor) -> Tensor:
        """Run only the convolutional backbone."""
        x = self.features(x)
        return x

    def forward_head(self, x: Tensor) -> Tensor:
        """Global-average-pool the feature map and apply the classifier."""
        x = self.pool(x)
        x = self.classifier(x)
        return x

    def construct(self, x: Tensor) -> Tensor:
        x = self.forward_features(x)
        x = self.forward_head(x)
        return x

mindcv.models.mobilenetv2.mobilenet_v2_035_128(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get MobileNetV2 model with width scaled by 0.35 and input image size of 128. Refer to the base class models.MobileNetV2 for more details.

Source code in mindcv\models\mobilenetv2.py
482
483
484
485
486
487
488
489
@register_model
def mobilenet_v2_035_128(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> MobileNetV2:
    """Build a MobileNetV2 with width multiplier 0.35 for 128x128 inputs.
    See `models.MobileNetV2` for the full architecture description.
    """
    build_args = dict(
        default_cfg=default_cfgs["mobilenet_v2_035_128"],
        alpha=0.35,
        num_classes=num_classes,
        in_channels=in_channels,
        **kwargs,
    )
    return _create_mobilenet_v2(pretrained, **build_args)

mindcv.models.mobilenetv2.mobilenet_v2_035_160(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get MobileNetV2 model with width scaled by 0.35 and input image size of 160. Refer to the base class models.MobileNetV2 for more details.

Source code in mindcv\models\mobilenetv2.py
472
473
474
475
476
477
478
479
@register_model
def mobilenet_v2_035_160(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> MobileNetV2:
    """Build a MobileNetV2 with width multiplier 0.35 for 160x160 inputs.
    See `models.MobileNetV2` for the full architecture description.
    """
    build_args = dict(
        default_cfg=default_cfgs["mobilenet_v2_035_160"],
        alpha=0.35,
        num_classes=num_classes,
        in_channels=in_channels,
        **kwargs,
    )
    return _create_mobilenet_v2(pretrained, **build_args)

mindcv.models.mobilenetv2.mobilenet_v2_035_192(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get MobileNetV2 model with width scaled by 0.35 and input image size of 192. Refer to the base class models.MobileNetV2 for more details.

Source code in mindcv\models\mobilenetv2.py
462
463
464
465
466
467
468
469
@register_model
def mobilenet_v2_035_192(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> MobileNetV2:
    """Build a MobileNetV2 with width multiplier 0.35 for 192x192 inputs.
    See `models.MobileNetV2` for the full architecture description.
    """
    build_args = dict(
        default_cfg=default_cfgs["mobilenet_v2_035_192"],
        alpha=0.35,
        num_classes=num_classes,
        in_channels=in_channels,
        **kwargs,
    )
    return _create_mobilenet_v2(pretrained, **build_args)

mindcv.models.mobilenetv2.mobilenet_v2_035_224(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get MobileNetV2 model with width scaled by 0.35 and input image size of 224. Refer to the base class models.MobileNetV2 for more details.

Source code in mindcv\models\mobilenetv2.py
452
453
454
455
456
457
458
459
@register_model
def mobilenet_v2_035_224(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> MobileNetV2:
    """Build a MobileNetV2 with width multiplier 0.35 for 224x224 inputs.
    See `models.MobileNetV2` for the full architecture description.
    """
    build_args = dict(
        default_cfg=default_cfgs["mobilenet_v2_035_224"],
        alpha=0.35,
        num_classes=num_classes,
        in_channels=in_channels,
        **kwargs,
    )
    return _create_mobilenet_v2(pretrained, **build_args)

mindcv.models.mobilenetv2.mobilenet_v2_035_96(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get MobileNetV2 model with width scaled by 0.35 and input image size of 96. Refer to the base class models.MobileNetV2 for more details.

Source code in mindcv\models\mobilenetv2.py
492
493
494
495
496
497
498
499
@register_model
def mobilenet_v2_035_96(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> MobileNetV2:
    """Build a MobileNetV2 with width multiplier 0.35 for 96x96 inputs.
    See `models.MobileNetV2` for the full architecture description.
    """
    build_args = dict(
        default_cfg=default_cfgs["mobilenet_v2_035_96"],
        alpha=0.35,
        num_classes=num_classes,
        in_channels=in_channels,
        **kwargs,
    )
    return _create_mobilenet_v2(pretrained, **build_args)

mindcv.models.mobilenetv2.mobilenet_v2_050_128(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get MobileNetV2 model with width scaled by 0.5 and input image size of 128. Refer to the base class models.MobileNetV2 for more details.

Source code in mindcv\models\mobilenetv2.py
432
433
434
435
436
437
438
439
@register_model
def mobilenet_v2_050_128(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> MobileNetV2:
    """Build a MobileNetV2 with width multiplier 0.5 for 128x128 inputs.
    See `models.MobileNetV2` for the full architecture description.
    """
    build_args = dict(
        default_cfg=default_cfgs["mobilenet_v2_050_128"],
        alpha=0.5,
        num_classes=num_classes,
        in_channels=in_channels,
        **kwargs,
    )
    return _create_mobilenet_v2(pretrained, **build_args)

mindcv.models.mobilenetv2.mobilenet_v2_050_160(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get MobileNetV2 model with width scaled by 0.5 and input image size of 160. Refer to the base class models.MobileNetV2 for more details.

Source code in mindcv\models\mobilenetv2.py
422
423
424
425
426
427
428
429
@register_model
def mobilenet_v2_050_160(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> MobileNetV2:
    """Build a MobileNetV2 with width multiplier 0.5 for 160x160 inputs.
    See `models.MobileNetV2` for the full architecture description.
    """
    build_args = dict(
        default_cfg=default_cfgs["mobilenet_v2_050_160"],
        alpha=0.5,
        num_classes=num_classes,
        in_channels=in_channels,
        **kwargs,
    )
    return _create_mobilenet_v2(pretrained, **build_args)

mindcv.models.mobilenetv2.mobilenet_v2_050_192(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get MobileNetV2 model with width scaled by 0.5 and input image size of 192. Refer to the base class models.MobileNetV2 for more details.

Source code in mindcv\models\mobilenetv2.py
412
413
414
415
416
417
418
419
@register_model
def mobilenet_v2_050_192(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> MobileNetV2:
    """Build a MobileNetV2 with width multiplier 0.5 for 192x192 inputs.
    See `models.MobileNetV2` for the full architecture description.
    """
    build_args = dict(
        default_cfg=default_cfgs["mobilenet_v2_050_192"],
        alpha=0.5,
        num_classes=num_classes,
        in_channels=in_channels,
        **kwargs,
    )
    return _create_mobilenet_v2(pretrained, **build_args)

mindcv.models.mobilenetv2.mobilenet_v2_050_224(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get MobileNetV2 model with width scaled by 0.5 and input image size of 224. Refer to the base class models.MobileNetV2 for more details.

Source code in mindcv\models\mobilenetv2.py
402
403
404
405
406
407
408
409
@register_model
def mobilenet_v2_050_224(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> MobileNetV2:
    """Build a MobileNetV2 with width multiplier 0.5 for 224x224 inputs.
    See `models.MobileNetV2` for the full architecture description.
    """
    build_args = dict(
        default_cfg=default_cfgs["mobilenet_v2_050_224"],
        alpha=0.5,
        num_classes=num_classes,
        in_channels=in_channels,
        **kwargs,
    )
    return _create_mobilenet_v2(pretrained, **build_args)

mindcv.models.mobilenetv2.mobilenet_v2_050_96(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get MobileNetV2 model with width scaled by 0.5 and input image size of 96. Refer to the base class models.MobileNetV2 for more details.

Source code in mindcv\models\mobilenetv2.py
442
443
444
445
446
447
448
449
@register_model
def mobilenet_v2_050_96(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> MobileNetV2:
    """Build a MobileNetV2 with width multiplier 0.5 for 96x96 inputs.
    See `models.MobileNetV2` for the full architecture description.
    """
    build_args = dict(
        default_cfg=default_cfgs["mobilenet_v2_050_96"],
        alpha=0.5,
        num_classes=num_classes,
        in_channels=in_channels,
        **kwargs,
    )
    return _create_mobilenet_v2(pretrained, **build_args)

mindcv.models.mobilenetv2.mobilenet_v2_075(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get MobileNetV2 model with width scaled by 0.75 and input image size of 224. Refer to the base class models.MobileNetV2 for more details.

Source code in mindcv\models\mobilenetv2.py
352
353
354
355
356
357
358
359
@register_model
def mobilenet_v2_075(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> MobileNetV2:
    """Build a MobileNetV2 with width multiplier 0.75 for 224x224 inputs.
    See `models.MobileNetV2` for the full architecture description.
    """
    build_args = dict(
        default_cfg=default_cfgs["mobilenet_v2_075"],
        alpha=0.75,
        num_classes=num_classes,
        in_channels=in_channels,
        **kwargs,
    )
    return _create_mobilenet_v2(pretrained, **build_args)

mindcv.models.mobilenetv2.mobilenet_v2_075_128(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get MobileNetV2 model with width scaled by 0.75 and input image size of 128. Refer to the base class models.MobileNetV2 for more details.

Source code in mindcv\models\mobilenetv2.py
382
383
384
385
386
387
388
389
@register_model
def mobilenet_v2_075_128(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> MobileNetV2:
    """Build a MobileNetV2 with width multiplier 0.75 for 128x128 inputs.
    See `models.MobileNetV2` for the full architecture description.
    """
    build_args = dict(
        default_cfg=default_cfgs["mobilenet_v2_075_128"],
        alpha=0.75,
        num_classes=num_classes,
        in_channels=in_channels,
        **kwargs,
    )
    return _create_mobilenet_v2(pretrained, **build_args)

mindcv.models.mobilenetv2.mobilenet_v2_075_160(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get MobileNetV2 model with width scaled by 0.75 and input image size of 160. Refer to the base class models.MobileNetV2 for more details.

Source code in mindcv\models\mobilenetv2.py
372
373
374
375
376
377
378
379
@register_model
def mobilenet_v2_075_160(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> MobileNetV2:
    """Build a MobileNetV2 with width multiplier 0.75 for 160x160 inputs.
    See `models.MobileNetV2` for the full architecture description.
    """
    build_args = dict(
        default_cfg=default_cfgs["mobilenet_v2_075_160"],
        alpha=0.75,
        num_classes=num_classes,
        in_channels=in_channels,
        **kwargs,
    )
    return _create_mobilenet_v2(pretrained, **build_args)

mindcv.models.mobilenetv2.mobilenet_v2_075_192(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get MobileNetV2 model with width scaled by 0.75 and input image size of 192. Refer to the base class models.MobileNetV2 for more details.

Source code in mindcv\models\mobilenetv2.py
362
363
364
365
366
367
368
369
@register_model
def mobilenet_v2_075_192(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> MobileNetV2:
    """Build a MobileNetV2 with width multiplier 0.75 for 192x192 inputs.
    See `models.MobileNetV2` for the full architecture description.
    """
    build_args = dict(
        default_cfg=default_cfgs["mobilenet_v2_075_192"],
        alpha=0.75,
        num_classes=num_classes,
        in_channels=in_channels,
        **kwargs,
    )
    return _create_mobilenet_v2(pretrained, **build_args)

mindcv.models.mobilenetv2.mobilenet_v2_075_96(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get MobileNetV2 model with width scaled by 0.75 and input image size of 96. Refer to the base class models.MobileNetV2 for more details.

Source code in mindcv\models\mobilenetv2.py
392
393
394
395
396
397
398
399
@register_model
def mobilenet_v2_075_96(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> MobileNetV2:
    """Build a MobileNetV2 with width multiplier 0.75 for 96x96 inputs.
    See `models.MobileNetV2` for the full architecture description.
    """
    build_args = dict(
        default_cfg=default_cfgs["mobilenet_v2_075_96"],
        alpha=0.75,
        num_classes=num_classes,
        in_channels=in_channels,
        **kwargs,
    )
    return _create_mobilenet_v2(pretrained, **build_args)

mindcv.models.mobilenetv2.mobilenet_v2_100(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get MobileNetV2 model without width scaling and input image size of 224. Refer to the base class models.MobileNetV2 for more details.

Source code in mindcv\models\mobilenetv2.py
302
303
304
305
306
307
308
309
@register_model
def mobilenet_v2_100(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> MobileNetV2:
    """Build a MobileNetV2 without width scaling (alpha=1.0) for 224x224 inputs.
    See `models.MobileNetV2` for the full architecture description.
    """
    build_args = dict(
        default_cfg=default_cfgs["mobilenet_v2_100"],
        alpha=1.0,
        num_classes=num_classes,
        in_channels=in_channels,
        **kwargs,
    )
    return _create_mobilenet_v2(pretrained, **build_args)

mindcv.models.mobilenetv2.mobilenet_v2_100_128(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get MobileNetV2 model without width scaling and input image size of 128. Refer to the base class models.MobileNetV2 for more details.

Source code in mindcv\models\mobilenetv2.py
332
333
334
335
336
337
338
339
@register_model
def mobilenet_v2_100_128(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> MobileNetV2:
    """Build a MobileNetV2 without width scaling (alpha=1.0) for 128x128 inputs.
    See `models.MobileNetV2` for the full architecture description.
    """
    build_args = dict(
        default_cfg=default_cfgs["mobilenet_v2_100_128"],
        alpha=1.0,
        num_classes=num_classes,
        in_channels=in_channels,
        **kwargs,
    )
    return _create_mobilenet_v2(pretrained, **build_args)

mindcv.models.mobilenetv2.mobilenet_v2_100_160(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get MobileNetV2 model without width scaling and input image size of 160. Refer to the base class models.MobileNetV2 for more details.

Source code in mindcv\models\mobilenetv2.py
322
323
324
325
326
327
328
329
@register_model
def mobilenet_v2_100_160(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> MobileNetV2:
    """Build a MobileNetV2 without width scaling (alpha=1.0) for 160x160 inputs.
    See `models.MobileNetV2` for the full architecture description.
    """
    build_args = dict(
        default_cfg=default_cfgs["mobilenet_v2_100_160"],
        alpha=1.0,
        num_classes=num_classes,
        in_channels=in_channels,
        **kwargs,
    )
    return _create_mobilenet_v2(pretrained, **build_args)

mindcv.models.mobilenetv2.mobilenet_v2_100_192(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get MobileNetV2 model without width scaling and input image size of 192. Refer to the base class models.MobileNetV2 for more details.

Source code in mindcv\models\mobilenetv2.py
312
313
314
315
316
317
318
319
@register_model
def mobilenet_v2_100_192(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> MobileNetV2:
    """Build a MobileNetV2 without width scaling (alpha=1.0) for 192x192 inputs.
    See `models.MobileNetV2` for the full architecture description.
    """
    build_args = dict(
        default_cfg=default_cfgs["mobilenet_v2_100_192"],
        alpha=1.0,
        num_classes=num_classes,
        in_channels=in_channels,
        **kwargs,
    )
    return _create_mobilenet_v2(pretrained, **build_args)

mindcv.models.mobilenetv2.mobilenet_v2_100_96(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get MobileNetV2 model without width scaling and input image size of 96. Refer to the base class models.MobileNetV2 for more details.

Source code in mindcv\models\mobilenetv2.py
342
343
344
345
346
347
348
349
@register_model
def mobilenet_v2_100_96(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> MobileNetV2:
    """Build a MobileNetV2 without width scaling (alpha=1.0) for 96x96 inputs.
    See `models.MobileNetV2` for the full architecture description.
    """
    build_args = dict(
        default_cfg=default_cfgs["mobilenet_v2_100_96"],
        alpha=1.0,
        num_classes=num_classes,
        in_channels=in_channels,
        **kwargs,
    )
    return _create_mobilenet_v2(pretrained, **build_args)

mindcv.models.mobilenetv2.mobilenet_v2_130_224(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get MobileNetV2 model with width scaled by 1.3 and input image size of 224. Refer to the base class models.MobileNetV2 for more details.

Source code in mindcv\models\mobilenetv2.py
292
293
294
295
296
297
298
299
@register_model
def mobilenet_v2_130_224(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> MobileNetV2:
    """Build a MobileNetV2 with width multiplier 1.3 for 224x224 inputs.
    See `models.MobileNetV2` for the full architecture description.
    """
    build_args = dict(
        default_cfg=default_cfgs["mobilenet_v2_130_224"],
        alpha=1.3,
        num_classes=num_classes,
        in_channels=in_channels,
        **kwargs,
    )
    return _create_mobilenet_v2(pretrained, **build_args)

mindcv.models.mobilenetv2.mobilenet_v2_140(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get MobileNetV2 model with width scaled by 1.4 and input image size of 224. Refer to the base class models.MobileNetV2 for more details.

Source code in mindcv\models\mobilenetv2.py
282
283
284
285
286
287
288
289
@register_model
def mobilenet_v2_140(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> MobileNetV2:
    """Build a MobileNetV2 with width multiplier 1.4 for 224x224 inputs.
    See `models.MobileNetV2` for the full architecture description.
    """
    build_args = dict(
        default_cfg=default_cfgs["mobilenet_v2_140"],
        alpha=1.4,
        num_classes=num_classes,
        in_channels=in_channels,
        **kwargs,
    )
    return _create_mobilenet_v2(pretrained, **build_args)

mobilenetv3

mindcv.models.mobilenetv3

MindSpore implementation of MobileNetV3. Refer to Searching for MobileNetV3.

mindcv.models.mobilenetv3.Bottleneck

Bases: Cell

Bottleneck block of MobileNetV3: depth-wise separable convolutions + inverted residual + squeeze-and-excitation.

Source code in mindcv\models\mobilenetv3.py
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
class Bottleneck(nn.Cell):
    """MobileNetV3 bottleneck block: pointwise expansion, depthwise convolution,
    optional squeeze-and-excitation, then a linear pointwise projection, with an
    identity shortcut when stride is 1 and the channel count is unchanged."""

    def __init__(
        self,
        in_channels: int,
        mid_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        activation: str = "relu",
        use_se: bool = False,
    ) -> None:
        super().__init__()
        self.use_se = use_se
        self.use_res_connect = stride == 1 and in_channels == out_channels
        assert activation in ["relu", "hswish"]
        # Store the activation *class*; instantiated fresh for each stage below.
        self.activation = nn.HSwish if activation == "hswish" else nn.ReLU

        ops = []
        # Pointwise expansion (skipped when no channel expansion is needed).
        if in_channels != mid_channels:
            ops.append(nn.Conv2d(in_channels, mid_channels, 1, 1, pad_mode="pad", padding=0, has_bias=False))
            ops.append(nn.BatchNorm2d(mid_channels))
            ops.append(self.activation())
        # Depthwise convolution.
        ops.append(nn.Conv2d(mid_channels, mid_channels, kernel_size, stride,
                             pad_mode="same", group=mid_channels, has_bias=False))
        ops.append(nn.BatchNorm2d(mid_channels))
        ops.append(self.activation())
        # Optional squeeze-and-excitation with hard-sigmoid gating.
        if use_se:
            ops.append(SqueezeExcite(mid_channels, 1.0 / 4, act_layer=nn.ReLU, gate_layer=nn.HSigmoid))
        # Linear pointwise projection (no activation).
        ops.append(nn.Conv2d(mid_channels, out_channels, 1, 1, pad_mode="pad", padding=0, has_bias=False))
        ops.append(nn.BatchNorm2d(out_channels))
        self.layers = nn.SequentialCell(ops)

    def construct(self, x: Tensor) -> Tensor:
        out = self.layers(x)
        if self.use_res_connect:
            out = x + out
        return out

mindcv.models.mobilenetv3.MobileNetV3

Bases: Cell

MobileNetV3 model class, based on "Searching for MobileNetV3" <https://arxiv.org/abs/1905.02244>_

PARAMETER DESCRIPTION
arch

size of the architecture. 'small' or 'large'.

TYPE: str

alpha

scale factor of model width. Default: 1.

TYPE: float DEFAULT: 1.0

round_nearest

divisor of make divisible function. Default: 8.

TYPE: int DEFAULT: 8

in_channels

Number of input channels. Default: 3.

TYPE: int DEFAULT: 3

num_classes

number of classification classes. Default: 1000.

TYPE: int DEFAULT: 1000

Source code in mindcv\models\mobilenetv3.py
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
class MobileNetV3(nn.Cell):
    r"""MobileNetV3 model class, based on
    `"Searching for MobileNetV3" <https://arxiv.org/abs/1905.02244>`_

    Args:
        arch: size of the architecture. 'small' or 'large'.
        alpha: scale factor of model width. Default: 1.
        round_nearest: divisor of make divisible function. Default: 8.
        in_channels: number the channels of the input. Default: 3.
        num_classes: number of classification classes. Default: 1000.

    Raises:
        ValueError: if `arch` is neither 'large' nor 'small'.
    """

    def __init__(
        self,
        arch: str,
        alpha: float = 1.0,
        round_nearest: int = 8,
        in_channels: int = 3,
        num_classes: int = 1000,
    ) -> None:
        super().__init__()
        # Width-scaled stem channel count, rounded to a hardware-friendly multiple.
        input_channels = make_divisible(16 * alpha, round_nearest)
        # Setting of bottleneck blocks. ex: [k, e, c, se, nl, s]
        # k: kernel size of depth-wise conv
        # e: expansion size
        # c: number of output channel
        # se: whether there is a Squeeze-And-Excite in that block
        # nl: type of non-linearity used
        # s: stride of depth-wise conv
        if arch == "large":
            bottleneck_setting = [
                [3, 16, 16, False, "relu", 1],
                [3, 64, 24, False, "relu", 2],
                [3, 72, 24, False, "relu", 1],
                [5, 72, 40, True, "relu", 2],
                [5, 120, 40, True, "relu", 1],
                [5, 120, 40, True, "relu", 1],
                [3, 240, 80, False, "hswish", 2],
                [3, 200, 80, False, "hswish", 1],
                [3, 184, 80, False, "hswish", 1],
                [3, 184, 80, False, "hswish", 1],
                [3, 480, 112, True, "hswish", 1],
                [3, 672, 112, True, "hswish", 1],
                [5, 672, 160, True, "hswish", 2],
                [5, 960, 160, True, "hswish", 1],
                [5, 960, 160, True, "hswish", 1],
            ]
            last_channels = make_divisible(alpha * 1280, round_nearest)
        elif arch == "small":
            bottleneck_setting = [
                [3, 16, 16, True, "relu", 2],
                [3, 72, 24, False, "relu", 2],
                [3, 88, 24, False, "relu", 1],
                [5, 96, 40, True, "hswish", 2],
                [5, 240, 40, True, "hswish", 1],
                [5, 240, 40, True, "hswish", 1],
                [5, 120, 48, True, "hswish", 1],
                [5, 144, 48, True, "hswish", 1],
                [5, 288, 96, True, "hswish", 2],
                [5, 576, 96, True, "hswish", 1],
                [5, 576, 96, True, "hswish", 1],
            ]
            last_channels = make_divisible(alpha * 1024, round_nearest)
        else:
            raise ValueError(f"Unsupported model type {arch}")

        # Building stem conv layer.
        features = [
            nn.Conv2d(in_channels, input_channels, 3, 2, pad_mode="pad", padding=1, has_bias=False),
            nn.BatchNorm2d(input_channels),
            nn.HSwish(),
        ]

        # Running product of all strides so far; the stem contributes a factor of 2.
        total_reduction = 2
        # feature_info records (channels, reduction, layer name) for each feature
        # stage; `name` indexes into the flat `features` list built below.
        self.feature_info = [dict(chs=input_channels, reduction=total_reduction, name=f'features.{len(features) - 1}')]

        # Building bottleneck blocks.
        for k, e, c, se, nl, s in bottleneck_setting:
            exp_channels = make_divisible(alpha * e, round_nearest)
            output_channels = make_divisible(alpha * c, round_nearest)
            features.append(Bottleneck(input_channels, exp_channels, output_channels,
                                       kernel_size=k, stride=s, activation=nl, use_se=se))
            input_channels = output_channels

            total_reduction *= s
            self.feature_info.append(dict(chs=input_channels, reduction=total_reduction,
                                          name=f'features.{len(features) - 1}'))

        # Building last point-wise conv layers.
        # Fixed 6x expansion of the final bottleneck's output channels.
        output_channels = input_channels * 6
        features.extend([
            nn.Conv2d(input_channels, output_channels, 1, 1, pad_mode="pad", padding=0, has_bias=False),
            nn.BatchNorm2d(output_channels),
            nn.HSwish(),
        ])

        self.feature_info.append(dict(chs=output_channels, reduction=total_reduction,
                                      name=f'features.{len(features) - 1}'))
        # NOTE(review): flag presumably consumed by a feature-extraction wrapper
        # outside this class — confirm against the features_only machinery.
        self.flatten_sequential = True

        self.features = nn.SequentialCell(features)

        self.pool = GlobalAvgPooling()
        self.classifier = nn.SequentialCell([
            nn.Dense(output_channels, last_channels),
            nn.HSwish(),
            Dropout(p=0.2),
            nn.Dense(last_channels, num_classes),
        ])
        self._initialize_weights()

    def _initialize_weights(self) -> None:
        """Initialize weights for cells."""
        for _, cell in self.cells_and_names():
            if isinstance(cell, nn.Conv2d):
                # He-style init: variance scaled by fan-out of the conv kernel.
                n = cell.kernel_size[0] * cell.kernel_size[1] * cell.out_channels
                cell.weight.set_data(
                    init.initializer(init.Normal(sigma=math.sqrt(2. / n), mean=0.0),
                                     cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(init.initializer("zeros", cell.bias.shape, cell.bias.dtype))
            elif isinstance(cell, nn.BatchNorm2d):
                cell.gamma.set_data(init.initializer("ones", cell.gamma.shape, cell.gamma.dtype))
                cell.beta.set_data(init.initializer("zeros", cell.beta.shape, cell.beta.dtype))
            elif isinstance(cell, nn.Dense):
                cell.weight.set_data(
                    init.initializer(init.Normal(sigma=0.01, mean=0.0), cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(init.initializer("zeros", cell.bias.shape, cell.bias.dtype))

    def forward_features(self, x: Tensor) -> Tensor:
        """Run the convolutional trunk (stem + bottlenecks + last point-wise conv)."""
        x = self.features(x)
        return x

    def forward_head(self, x: Tensor) -> Tensor:
        """Run global average pooling followed by the MLP classifier."""
        x = self.pool(x)
        x = self.classifier(x)
        return x

    def construct(self, x: Tensor) -> Tensor:
        x = self.forward_features(x)
        x = self.forward_head(x)
        return x

mindcv.models.mobilenetv3.mobilenet_v3_large_075(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get large MobileNetV3 model with width scaled by 0.75. Refer to the base class models.MobileNetV3 for more details.

Source code in mindcv\models\mobilenetv3.py
279
280
281
282
283
284
285
286
@register_model
def mobilenet_v3_large_075(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> MobileNetV3:
    """Build the large MobileNetV3 variant with a 0.75 width multiplier.
    See the base class `models.MobileNetV3` for architecture details.
    """
    return _create_mobilenet_v3(
        pretrained,
        default_cfg=default_cfgs["mobilenet_v3_large_075"],
        arch="large",
        alpha=0.75,
        in_channels=in_channels,
        num_classes=num_classes,
        **kwargs,
    )

mindcv.models.mobilenetv3.mobilenet_v3_large_100(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get large MobileNetV3 model without width scaling. Refer to the base class models.MobileNetV3 for more details.

Source code in mindcv\models\mobilenetv3.py
259
260
261
262
263
264
265
266
@register_model
def mobilenet_v3_large_100(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> MobileNetV3:
    """Build the large MobileNetV3 variant at full width (alpha=1.0).
    See the base class `models.MobileNetV3` for architecture details.
    """
    return _create_mobilenet_v3(
        pretrained,
        default_cfg=default_cfgs["mobilenet_v3_large_100"],
        arch="large",
        alpha=1.0,
        in_channels=in_channels,
        num_classes=num_classes,
        **kwargs,
    )

mindcv.models.mobilenetv3.mobilenet_v3_small_075(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get small MobileNetV3 model with width scaled by 0.75. Refer to the base class models.MobileNetV3 for more details.

Source code in mindcv\models\mobilenetv3.py
269
270
271
272
273
274
275
276
@register_model
def mobilenet_v3_small_075(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> MobileNetV3:
    """Build the small MobileNetV3 variant with a 0.75 width multiplier.
    See the base class `models.MobileNetV3` for architecture details.
    """
    return _create_mobilenet_v3(
        pretrained,
        default_cfg=default_cfgs["mobilenet_v3_small_075"],
        arch="small",
        alpha=0.75,
        in_channels=in_channels,
        num_classes=num_classes,
        **kwargs,
    )

mindcv.models.mobilenetv3.mobilenet_v3_small_100(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get small MobileNetV3 model without width scaling. Refer to the base class models.MobileNetV3 for more details.

Source code in mindcv\models\mobilenetv3.py
249
250
251
252
253
254
255
256
@register_model
def mobilenet_v3_small_100(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> MobileNetV3:
    """Build the small MobileNetV3 variant at full width (alpha=1.0).
    See the base class `models.MobileNetV3` for architecture details.
    """
    return _create_mobilenet_v3(
        pretrained,
        default_cfg=default_cfgs["mobilenet_v3_small_100"],
        arch="small",
        alpha=1.0,
        in_channels=in_channels,
        num_classes=num_classes,
        **kwargs,
    )

mobilevit

mindcv.models.mobilevit

MindSpore implementation of MobileViT. Refer to MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer.

mindcv.models.mobilevit.ConvLayer

Bases: Cell

Conv2d + BN + Act

Source code in mindcv\models\mobilevit.py
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
class ConvLayer(nn.Cell):
    """Convolution block: Conv2d followed by optional BatchNorm and activation."""

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 kernel_size: int = 3,
                 stride: int = 1,
                 pad_mode: str = "pad",
                 padding: Optional[int] = None,
                 dilation: int = 1,
                 groups: int = 1,
                 norm: Optional[nn.Cell] = nn.BatchNorm2d,
                 activation: Optional[nn.Cell] = nn.SiLU,
                 has_bias: Optional[bool] = False) -> None:
        super().__init__()

        # Explicit-pad mode: derive "same"-style padding when not given;
        # all other pad modes use zero explicit padding.
        if pad_mode != "pad":
            padding = 0
        elif padding is None:
            padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2

        # Enable the conv bias only when no normalization layer follows.
        if has_bias is None:
            has_bias = norm is None

        cells = [
            nn.Conv2d(
                in_channels,
                out_channels,
                kernel_size,
                stride,
                pad_mode=pad_mode,
                padding=padding,
                dilation=dilation,
                group=groups,
                has_bias=has_bias,
            )
        ]
        if norm:
            cells.append(norm(out_channels, momentum=0.9))
        if activation:
            cells.append(activation())

        self.features = nn.SequentialCell(cells)

    def construct(self, x):
        return self.features(x)

mindcv.models.mobilevit.InvertedResidual

Bases: Cell

This class implements the inverted residual block, as described in MobileNetv2 <https://arxiv.org/abs/1801.04381>_ paper

PARAMETER DESCRIPTION
in_channels

:math:C_{in} from an expected input of size :math:(N, C_{in}, H_{in}, W_{in})

TYPE: int

out_channels

:math:C_{out} from an expected output of size :math:(N, C_{out}, H_{out}, W_{out})

TYPE: int

stride

Use convolutions with a stride. Default: 1

TYPE: int

expand_ratio

Expand the input channels by this factor in depth-wise conv

TYPE: Union[int, float]

skip_connection

Use skip-connection. Default: True

TYPE: Optional[bool] DEFAULT: True

Shape
  • Input: :math:(N, C_{in}, H_{in}, W_{in})
  • Output: :math:(N, C_{out}, H_{out}, W_{out})

.. note:: If in_channels != out_channels and stride > 1, we set skip_connection=False

Source code in mindcv\models\mobilevit.py
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
class InvertedResidual(nn.Cell):
    """
    Inverted residual block, as described in the
    `MobileNetv2 <https://arxiv.org/abs/1801.04381>`_ paper.

    Args:
        in_channels (int): :math:`C_{in}` from an expected input of size :math:`(N, C_{in}, H_{in}, W_{in})`
        out_channels (int): :math:`C_{out}` from an expected output of size :math:`(N, C_{out}, H_{out}, W_{out})`
        stride (int): Use convolutions with a stride. Default: 1
        expand_ratio (Union[int, float]): Expand the input channels by this factor in depth-wise conv
        skip_connection (Optional[bool]): Use skip-connection. Default: True

    Shape:
        - Input: :math:`(N, C_{in}, H_{in}, W_{in})`
        - Output: :math:`(N, C_{out}, H_{out}, W_{out})`

    .. note::
        The residual connection is used only when `stride == 1`,
        `in_channels == out_channels` and `skip_connection` is True.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        stride: int,
        expand_ratio: Union[int, float],
        skip_connection: Optional[bool] = True,
    ) -> None:
        assert stride in [1, 2]
        hidden_dim = make_divisible(int(round(in_channels * expand_ratio)), 8)

        super().__init__()

        layers = []
        # 1) Point-wise expansion (omitted when the block does not expand).
        if expand_ratio != 1:
            layers.append(
                ConvLayer(
                    in_channels=in_channels,
                    out_channels=hidden_dim,
                    kernel_size=1,
                    norm=nn.BatchNorm2d,
                    activation=nn.SiLU,
                ),
            )
        # 2) Depth-wise 3x3 convolution (groups == channels).
        layers.append(
            ConvLayer(
                in_channels=hidden_dim,
                out_channels=hidden_dim,
                kernel_size=3,
                stride=stride,
                groups=hidden_dim,
                norm=nn.BatchNorm2d,
                activation=nn.SiLU,
            ),
        )
        # 3) Linear point-wise projection (no activation).
        layers.append(
            ConvLayer(
                in_channels=hidden_dim,
                out_channels=out_channels,
                kernel_size=1,
                has_bias=False,
                activation=None,
            ),
        )

        self.block = nn.SequentialCell(layers)
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.exp = expand_ratio
        self.stride = stride
        self.use_res_connect = (
            self.stride == 1 and in_channels == out_channels and skip_connection
        )

    def construct(self, x: Tensor, *args, **kwargs) -> Tensor:
        out = self.block(x)
        if self.use_res_connect:
            out = x + out
        return out

mindcv.models.mobilevit.MobileViT

Bases: Cell

This class implements the MobileViT architecture <https://arxiv.org/abs/2110.02178?context=cs.LG>_

Source code in mindcv\models\mobilevit.py
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
class MobileViT(nn.Cell):
    """
    This class implements the `MobileViT architecture <https://arxiv.org/abs/2110.02178?context=cs.LG>`_

    Args:
        model_cfg (Dict): architecture configuration with per-stage settings
            ("layer1" .. "layer5"), "last_layer_exp_factor" and "cls_dropout".
        num_classes (int): number of classification classes. Default: 1000
    """
    def __init__(self, model_cfg: Dict, num_classes: int = 1000):
        super().__init__()

        image_channels = 3
        out_channels = 16

        # Stem: 3x3 stride-2 convolution.
        self.conv_1 = ConvLayer(
            in_channels=image_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=2
        )

        # Five stages; each is either a MobileNetV2-style stage or a MobileViT stage,
        # selected per-stage by cfg["block_type"].
        self.layer_1, out_channels = self._make_layer(input_channel=out_channels, cfg=model_cfg["layer1"])
        self.layer_2, out_channels = self._make_layer(input_channel=out_channels, cfg=model_cfg["layer2"])
        self.layer_3, out_channels = self._make_layer(input_channel=out_channels, cfg=model_cfg["layer3"])
        self.layer_4, out_channels = self._make_layer(input_channel=out_channels, cfg=model_cfg["layer4"])
        self.layer_5, out_channels = self._make_layer(input_channel=out_channels, cfg=model_cfg["layer5"])

        # Final 1x1 expansion, capped at 960 channels.
        exp_channels = min(model_cfg["last_layer_exp_factor"] * out_channels, 960)
        self.conv_1x1_exp = ConvLayer(
            in_channels=out_channels,
            out_channels=exp_channels,
            kernel_size=1
        )

        # Head: global pool -> flatten -> (optional dropout) -> linear classifier.
        classifier = []
        classifier.append(GlobalAvgPooling())
        classifier.append(nn.Flatten())
        if 0.0 < model_cfg["cls_dropout"] < 1.0:
            classifier.append(Dropout(p=model_cfg["cls_dropout"]))
        classifier.append(nn.Dense(in_channels=exp_channels, out_channels=num_classes))
        self.classifier = nn.SequentialCell(classifier)
        self._initialize_weights()

    def _make_layer(self, input_channel, cfg: Dict) -> Tuple[nn.SequentialCell, int]:
        """Dispatch stage construction by cfg["block_type"] ("mobilevit" or MobileNet-style)."""
        block_type = cfg.get("block_type", "mobilevit")
        if block_type.lower() == "mobilevit":
            return self._make_mit_layer(input_channel=input_channel, cfg=cfg)
        else:
            return self._make_mobilenet_layer(input_channel=input_channel, cfg=cfg)

    @staticmethod
    def _make_mobilenet_layer(input_channel: int, cfg: Dict) -> Tuple[nn.SequentialCell, int]:
        """Build a stage of InvertedResidual blocks; only the first block may stride."""
        output_channels = cfg.get("out_channels")
        num_blocks = cfg.get("num_blocks", 2)
        expand_ratio = cfg.get("expand_ratio", 4)
        block = []

        for i in range(num_blocks):
            # Only the first block in a stage downsamples.
            stride = cfg.get("stride", 1) if i == 0 else 1

            layer = InvertedResidual(
                in_channels=input_channel,
                out_channels=output_channels,
                stride=stride,
                expand_ratio=expand_ratio
            )
            block.append(layer)
            input_channel = output_channels

        return nn.SequentialCell(block), input_channel

    @staticmethod
    def _make_mit_layer(input_channel: int, cfg: Dict) -> Tuple[nn.SequentialCell, int]:
        """Build a MobileViT stage: optional strided InvertedResidual + MobileViTBlock."""
        stride = cfg.get("stride", 1)
        block = []

        if stride == 2:
            layer = InvertedResidual(
                in_channels=input_channel,
                out_channels=cfg.get("out_channels"),
                stride=stride,
                expand_ratio=cfg.get("mv_expand_ratio", 4)
            )

            block.append(layer)
            input_channel = cfg.get("out_channels")

        transformer_dim = cfg["transformer_channels"]
        ffn_dim = cfg.get("ffn_dim")
        num_heads = cfg.get("num_heads", 4)
        head_dim = transformer_dim // num_heads

        if transformer_dim % head_dim != 0:
            raise ValueError("Transformer input dimension should be divisible by head dimension. "
                             "Got {} and {}.".format(transformer_dim, head_dim))

        block.append(MobileViTBlock(
            in_channels=input_channel,
            out_channels=cfg.get("out_channels"),
            transformer_dim=transformer_dim,
            ffn_dim=ffn_dim,
            n_transformer_blocks=cfg.get("transformer_blocks", 1),
            patch_h=cfg.get("patch_h", 2),
            patch_w=cfg.get("patch_w", 2),
            dropout=cfg.get("dropout", 0.1),
            ffn_dropout=cfg.get("ffn_dropout", 0.0),
            attn_dropout=cfg.get("attn_dropout", 0.1),
            head_dim=head_dim,
            conv_ksize=3
        ))

        # NOTE(review): returns `input_channel`, which matches the MobileViTBlock's
        # real output (cfg["out_channels"]) only when the stride-2 branch updated it.
        # For stride != 2 with out_channels != input_channel this would under-report
        # the stage's channel count — confirm against the model configs in use.
        return nn.SequentialCell(block), input_channel

    def _initialize_weights(self) -> None:
        """Initialize weights for cells."""
        for _, cell in self.cells_and_names():
            if isinstance(cell, nn.Dense):
                if cell.weight is not None:
                    cell.weight.set_data(init.initializer(init.TruncatedNormal(sigma=.02), cell.weight.shape,
                                                          cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(init.initializer('zeros', cell.bias.shape, cell.bias.dtype))
            elif isinstance(cell, (nn.LayerNorm, nn.BatchNorm2d)):
                if cell.gamma is not None:
                    cell.gamma.set_data(init.initializer('ones', cell.gamma.shape, cell.gamma.dtype))
                if cell.beta is not None:
                    cell.beta.set_data(init.initializer('zeros', cell.beta.shape, cell.beta.dtype))
            elif isinstance(cell, nn.Conv2d):
                if cell.weight is not None:
                    cell.weight.set_data(init.initializer(init.HeNormal(mode='fan_out', nonlinearity='leaky_relu'),
                                         cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(init.initializer('zeros', cell.bias.shape, cell.bias.dtype))

    def construct(self, x: Tensor) -> Tensor:
        # Stem -> five stages -> 1x1 expansion -> classifier head.
        x = self.conv_1(x)
        x = self.layer_1(x)
        x = self.layer_2(x)
        x = self.layer_3(x)
        x = self.layer_4(x)
        x = self.layer_5(x)
        x = self.conv_1x1_exp(x)
        x = self.classifier(x)
        return x

mindcv.models.mobilevit.MobileViTBlock

Bases: Cell

This class defines the MobileViT block <https://arxiv.org/abs/2110.02178?context=cs.LG>_

PARAMETER DESCRIPTION
opts

command line arguments

in_channels

:math:C_{in} from an expected input of size :math:(N, C_{in}, H, W)

TYPE: int

transformer_dim

Input dimension to the transformer unit

TYPE: int

ffn_dim

Dimension of the FFN block

TYPE: int

n_transformer_blocks

Number of transformer blocks. Default: 2

TYPE: int DEFAULT: 2

head_dim

Head dimension in the multi-head attention. Default: 32

TYPE: int DEFAULT: 32

attn_dropout

Dropout in multi-head attention. Default: 0.0

TYPE: float DEFAULT: 0.0

dropout

Dropout rate. Default: 0.0

TYPE: float DEFAULT: 0.0

ffn_dropout

Dropout between FFN layers in transformer. Default: 0.0

TYPE: float DEFAULT: 0.0

patch_h

Patch height for unfolding operation. Default: 8

TYPE: int DEFAULT: 8

patch_w

Patch width for unfolding operation. Default: 8

TYPE: int DEFAULT: 8

transformer_norm_layer

Normalization layer in the transformer block. Default: layer_norm

TYPE: Optional[str]

conv_ksize

Kernel size to learn local representations in MobileViT block. Default: 3

TYPE: int DEFAULT: 3

no_fusion

Do not combine the input and output feature maps. Default: False

TYPE: Optional[bool]

Source code in mindcv\models\mobilevit.py
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
class MobileViTBlock(nn.Cell):
    """
    This class defines the `MobileViT block <https://arxiv.org/abs/2110.02178?context=cs.LG>`_,
    which fuses a local (convolutional) representation of the feature map with a
    global (transformer) representation computed over unfolded patches.

    Args:
        in_channels (int): :math:`C_{in}` from an expected input of size :math:`(N, C_{in}, H, W)`
        out_channels (int): number of channels produced by the final fusion convolution
        transformer_dim (int): Input dimension to the transformer unit
        ffn_dim (int): Dimension of the FFN block
        n_transformer_blocks (int): Number of transformer blocks. Default: 2
        head_dim (int): Head dimension in the multi-head attention. Default: 32
        attn_dropout (float): Dropout in multi-head attention. Default: 0.0
        dropout (float): Dropout rate. Default: 0.0
        ffn_dropout (float): Dropout between FFN layers in transformer. Default: 0.0
        patch_h (int): Patch height for unfolding operation. Default: 8
        patch_w (int): Patch width for unfolding operation. Default: 8
        conv_ksize (int): Kernel size to learn local representations in MobileViT block. Default: 3
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        transformer_dim: int,
        ffn_dim: int,
        n_transformer_blocks: int = 2,
        head_dim: int = 32,
        attn_dropout: float = 0.0,
        dropout: float = 0.0,
        ffn_dropout: float = 0.0,
        patch_h: int = 8,
        patch_w: int = 8,
        conv_ksize: Optional[int] = 3,
        *args,
        **kwargs
    ) -> None:
        super().__init__()

        # Local representation: depth-preserving kxk conv then 1x1 projection
        # into the transformer embedding dimension.
        conv_3x3_in = ConvLayer(
            in_channels=in_channels,
            out_channels=in_channels,
            kernel_size=conv_ksize,
            stride=1
        )
        conv_1x1_in = ConvLayer(
            in_channels=in_channels,
            out_channels=transformer_dim,
            kernel_size=1,
            stride=1
        )

        # Project transformer output back to the input channel count.
        conv_1x1_out = ConvLayer(
            in_channels=transformer_dim,
            out_channels=in_channels,
            kernel_size=1,
            stride=1
        )
        # Fusion conv sees the input concatenated with the projected output
        # (hence 2 * in_channels input channels).
        conv_3x3_out = ConvLayer(
            in_channels=2 * in_channels,
            out_channels=out_channels,
            kernel_size=conv_ksize,
            stride=1,
            pad_mode="pad",
            padding=1
        )

        local_rep = []
        local_rep.append(conv_3x3_in)
        local_rep.append(conv_1x1_in)
        self.local_rep = nn.SequentialCell(local_rep)

        assert transformer_dim % head_dim == 0
        num_heads = transformer_dim // head_dim

        # Global representation: a stack of transformer encoders followed by
        # a final LayerNorm, iterated manually in construct (CellList).
        self.global_rep = [
            TransformerEncoder(
                embed_dim=transformer_dim,
                ffn_latent_dim=ffn_dim,
                num_heads=num_heads,
                attn_dropout=attn_dropout,
                dropout=dropout,
                ffn_dropout=ffn_dropout
            )
            for _ in range(n_transformer_blocks)
        ]
        self.global_rep.append(nn.LayerNorm((transformer_dim,)))
        self.global_rep = nn.CellList(self.global_rep)

        self.conv_proj = conv_1x1_out
        self.fusion = conv_3x3_out

        self.patch_h = patch_h
        self.patch_w = patch_w
        self.patch_area = self.patch_w * self.patch_h

        # Bookkeeping attributes (kept for introspection/repr).
        self.cnn_in_dim = in_channels
        self.cnn_out_dim = transformer_dim
        self.n_heads = num_heads
        self.ffn_dim = ffn_dim
        self.dropout = dropout
        self.attn_dropout = attn_dropout
        self.ffn_dropout = ffn_dropout
        self.n_blocks = n_transformer_blocks
        self.conv_ksize = conv_ksize
        self.interpolate = Interpolate(mode="bilinear", align_corners=True)

    def unfolding(self, x: Tensor) -> Tuple[Tensor, Dict]:
        """Split the (B, C, H, W) feature map into flattened patches of shape (B*P, N, C).

        P = patch_h * patch_w (pixels per patch), N = number of patches. The map is
        bilinearly resized first when H/W are not multiples of the patch size.
        Returns the patch tensor and a dict with the info needed by `folding`.
        """
        patch_w, patch_h = self.patch_w, self.patch_h
        patch_area = patch_w * patch_h
        batch_size, in_channels, orig_h, orig_w = x.shape

        # Round spatial dims up to the nearest multiple of the patch size.
        new_h = int(math.ceil(orig_h / self.patch_h) * self.patch_h)
        new_w = int(math.ceil(orig_w / self.patch_w) * self.patch_w)

        interpolate = False
        if new_w != orig_w or new_h != orig_h:
            # Note: Padding can be done, but then it needs to be handled in attention function.
            x = self.interpolate(x, size=(new_h, new_w))
            interpolate = True

        # number of patches along width and height
        num_patch_w = new_w // patch_w  # n_w
        num_patch_h = new_h // patch_h  # n_h
        num_patches = num_patch_h * num_patch_w  # N

        # [B, C, H, W] -> [B * C * n_h, p_h, n_w, p_w]
        x = ops.reshape(x, (batch_size * in_channels * num_patch_h, patch_h, num_patch_w, patch_w))
        # [B * C * n_h, p_h, n_w, p_w] -> [B * C * n_h, n_w, p_h, p_w]
        x = ops.transpose(x, (0, 2, 1, 3))
        # [B * C * n_h, n_w, p_h, p_w] -> [B, C, N, P] where P = p_h * p_w and N = n_h * n_w
        x = ops.reshape(x, (batch_size, in_channels, num_patches, patch_area))
        # [B, C, N, P] -> [B, P, N, C]
        x = ops.transpose(x, (0, 3, 2, 1))
        # [B, P, N, C] -> [BP, N, C]
        x = ops.reshape(x, (batch_size * patch_area, num_patches, -1))

        info_dict = {
            "orig_size": (orig_h, orig_w),
            "batch_size": batch_size,
            "interpolate": interpolate,
            "total_patches": num_patches,
            "num_patches_w": num_patch_w,
            "num_patches_h": num_patch_h,
        }

        return x, info_dict

    def folding(self, x: Tensor, info_dict: Dict) -> Tensor:
        """Inverse of `unfolding`: reassemble (B*P, N, C) patches into a (B, C, H, W) map.

        Resizes back to the original spatial size if `unfolding` interpolated.
        """
        n_dim = ops.rank(x)
        assert n_dim == 3, "Tensor should be of shape BPxNxC. Got: {}".format(
            x.shape
        )
        # [BP, N, C] --> [B, P, N, C]
        x = x.view(
            info_dict["batch_size"], self.patch_area, info_dict["total_patches"], -1
        )

        batch_size, pixels, num_patches, channels = x.shape
        num_patch_h = info_dict["num_patches_h"]
        num_patch_w = info_dict["num_patches_w"]

        # [B, P, N, C] -> [B, C, N, P]
        x = ops.transpose(x, (0, 3, 2, 1))
        # [B, C, N, P] -> [B*C*n_h, n_w, p_h, p_w]
        x = ops.reshape(x, (batch_size * channels * num_patch_h, num_patch_w, self.patch_h, self.patch_w))
        # [B*C*n_h, n_w, p_h, p_w] -> [B*C*n_h, p_h, n_w, p_w]
        x = ops.transpose(x, (0, 2, 1, 3))
        # [B*C*n_h, p_h, n_w, p_w] -> [B, C, H, W]
        x = ops.reshape(x, (batch_size, channels, num_patch_h * self.patch_h, num_patch_w * self.patch_w))
        if info_dict["interpolate"]:
            x = self.interpolate(x, size=info_dict["orig_size"])
        return x

    def construct(self, x: Tensor) -> Tensor:
        """Local conv -> unfold -> transformer stack -> fold -> project -> fuse with input."""
        res = x
        fm = self.local_rep(x)
        # convert feature map to patches
        patches, info_dict = self.unfolding(fm)
        # learn global representations
        for transformer_layer in self.global_rep:
            patches = transformer_layer(patches)
        # [B x Patch x Patches x C] -> [B x C x Patches x Patch]
        fm = self.folding(x=patches, info_dict=info_dict)
        fm = self.conv_proj(fm)
        # Concatenate the block input with the globally-processed map, then fuse.
        fm = self.fusion(ops.concat((res, fm), 1))
        return fm

mindcv.models.mobilevit.MultiHeadAttention

Bases: Cell

This layer applies a multi-head self- or cross-attention as described in Attention is all you need <https://arxiv.org/abs/1706.03762>_ paper

PARAMETER DESCRIPTION
embed_dim

:math:C_{in} from an expected input of size :math:(N, P, C_{in})

TYPE: int

num_heads

Number of heads in multi-head attention

TYPE: int

attn_dropout

Attention dropout. Default: 0.0

TYPE: float DEFAULT: 0.0

bias

Use bias or not. Default: True

TYPE: bool DEFAULT: True

Shape
  • Input: :math:(N, P, C_{in}) where :math:N is batch size, :math:P is number of patches, and :math:C_{in} is input embedding dim
  • Output: same shape as the input
Source code in mindcv\models\mobilevit.py
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
class MultiHeadAttention(nn.Cell):
    """
    This layer applies a multi-head self- or cross-attention as described in
    `Attention is all you need <https://arxiv.org/abs/1706.03762>`_ paper

    Args:
        embed_dim (int): :math:`C_{in}` from an expected input of size :math:`(N, P, C_{in})`
        num_heads (int): Number of heads in multi-head attention
        attn_dropout (float): Attention dropout. Default: 0.0
        bias (bool): Use bias or not. Default: ``True``

    Shape:
        - Input: :math:`(N, P, C_{in})` where :math:`N` is batch size, :math:`P` is number of patches,
        and :math:`C_{in}` is input embedding dim
        - Output: same shape as the input

    """

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        attn_dropout: float = 0.0,
        bias: bool = True,
        *args,
        **kwargs
    ) -> None:
        super().__init__()
        if embed_dim % num_heads != 0:
            raise ValueError(
                "Embedding dim must be divisible by number of heads in {}. Got: embed_dim={} and num_heads={}".format(
                    self.__class__.__name__, embed_dim, num_heads
                )
            )

        # single projection producing q, k and v in one matmul
        self.qkv_proj = nn.Dense(in_channels=embed_dim, out_channels=embed_dim * 3, has_bias=bias)

        self.attn_dropout = Dropout(p=attn_dropout)
        self.out_proj = nn.Dense(in_channels=embed_dim, out_channels=embed_dim, has_bias=bias)

        self.head_dim = embed_dim // num_heads
        self.scaling = self.head_dim ** -0.5
        self.softmax = nn.Softmax(axis=-1)
        self.num_heads = num_heads
        self.embed_dim = embed_dim
        # cached matmul ops: one plain for attn @ v, one with k transposed
        # for q @ k^T (previously re-created inside construct on every call)
        self.batch_matmul = ops.BatchMatMul()
        self.batch_matmul_qk = ops.BatchMatMul(transpose_b=True)

    def construct(self, x: Tensor) -> Tensor:
        """Apply scaled dot-product multi-head attention to ``x`` of shape (B, N, C)."""
        B, N, C = x.shape
        qkv = self.qkv_proj(x)
        # (B, N, 3C) -> (3, B, num_heads, N, head_dim)
        qkv = ops.reshape(qkv, (B, N, 3, self.num_heads, C // self.num_heads))
        qkv = ops.transpose(qkv, (2, 0, 3, 1, 4))
        q, k, v = qkv[0], qkv[1], qkv[2]

        # reuse the ops built in __init__ instead of instantiating
        # ops.BatchMatMul / nn.Softmax anew on every forward pass
        attn = self.batch_matmul_qk(q, k) * self.scaling
        attn = self.softmax(attn)
        attn = self.attn_dropout(attn)

        x = ops.transpose(self.batch_matmul(attn, v), (0, 2, 1, 3))
        x = ops.reshape(x, (B, N, C))
        x = self.out_proj(x)
        return x

mindcv.models.mobilevit.TransformerEncoder

Bases: Cell

This class defines the pre-norm `Transformer encoder <https://arxiv.org/abs/1706.03762>`_

PARAMETER DESCRIPTION

embed_dim: :math:`C_{in}` from an expected input of size :math:`(N, P, C_{in})`. TYPE: int

ffn_latent_dim: Inner dimension of the FFN. TYPE: int

num_heads: Number of heads in multi-head attention. Default: 8. TYPE: int

attn_dropout: Dropout rate for attention in multi-head attention. Default: 0.0. TYPE: float

dropout: Dropout rate. Default: 0.0. TYPE: float

ffn_dropout: Dropout between FFN layers. Default: 0.0. TYPE: float

Shape
  • Input: :math:(N, P, C_{in}) where :math:N is batch size, :math:P is number of patches, and :math:C_{in} is input embedding dim
  • Output: same shape as the input
Source code in mindcv\models\mobilevit.py
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
class TransformerEncoder(nn.Cell):
    """
    Pre-norm Transformer encoder block (`Transformer encoder
    <https://arxiv.org/abs/1706.03762>`_): LayerNorm -> multi-head attention
    -> residual add, followed by LayerNorm -> FFN -> residual add.

    Args:
        embed_dim (int): :math:`C_{in}` of an expected input of size :math:`(N, P, C_{in})`
        ffn_latent_dim (int): Inner (hidden) dimension of the FFN
        num_heads (int): Number of attention heads. Default: 8
        attn_dropout (float): Dropout rate inside multi-head attention. Default: 0.0
        dropout (float): Dropout rate after attention and after the FFN. Default: 0.0
        ffn_dropout (float): Dropout between the two FFN layers. Default: 0.0

    Shape:
        - Input: :math:`(N, P, C_{in})` where :math:`N` is batch size, :math:`P` is number of patches,
        and :math:`C_{in}` is input embedding dim
        - Output: same shape as the input
    """

    def __init__(
        self,
        embed_dim: int,
        ffn_latent_dim: int,
        num_heads: Optional[int] = 8,
        attn_dropout: Optional[float] = 0.0,
        dropout: Optional[float] = 0.0,
        ffn_dropout: Optional[float] = 0.0,
        *args,
        **kwargs
    ) -> None:

        super().__init__()

        # attention sub-block: norm -> MHA -> dropout
        self.pre_norm_mha = nn.SequentialCell(
            nn.LayerNorm((embed_dim,)),
            MultiHeadAttention(embed_dim, num_heads, attn_dropout=attn_dropout, bias=True),
            Dropout(p=dropout),
        )

        # feed-forward sub-block: norm -> expand -> SiLU -> drop -> contract -> drop
        self.pre_norm_ffn = nn.SequentialCell(
            nn.LayerNorm((embed_dim,)),
            nn.Dense(in_channels=embed_dim, out_channels=ffn_latent_dim, has_bias=True),
            nn.SiLU(),
            Dropout(p=ffn_dropout),
            nn.Dense(in_channels=ffn_latent_dim, out_channels=embed_dim, has_bias=True),
            Dropout(p=dropout),
        )
        self.embed_dim = embed_dim
        self.ffn_dim = ffn_latent_dim
        self.ffn_dropout = ffn_dropout
        self.std_dropout = dropout

    def construct(self, x: Tensor) -> Tensor:
        # attention sub-block with residual connection
        x = x + self.pre_norm_mha(x)
        # feed-forward sub-block with residual connection
        x = x + self.pre_norm_ffn(x)
        return x

nasnet

mindcv.models.nasnet

MindSpore implementation of NasNet. Refer to: Learning Transferable Architectures for Scalable Image Recognition

mindcv.models.nasnet.BranchSeparables

Bases: Cell

NasNet model basic architecture

Source code in mindcv\models\nasnet.py
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
class BranchSeparables(nn.Cell):
    """Two stacked (ReLU -> separable conv -> BatchNorm) stages used as the
    basic branch building block of NasNet cells. The first stage keeps the
    channel count (and may stride); the second maps to ``out_channels``."""

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int,
        padding: int,
        bias: bool = False,
    ) -> None:
        super().__init__()
        self.relu = nn.ReLU()
        # stage 1: channel-preserving separable conv, carries the stride
        self.separable_1 = SeparableConv2d(in_channels, in_channels, kernel_size, stride, padding, bias=bias)
        self.bn_sep_1 = nn.BatchNorm2d(num_features=in_channels, eps=0.001, momentum=0.9, affine=True)
        self.relu1 = nn.ReLU()
        # stage 2: stride-1 separable conv mapping to out_channels
        self.separable_2 = SeparableConv2d(in_channels, out_channels, kernel_size, 1, padding, bias=bias)
        self.bn_sep_2 = nn.BatchNorm2d(num_features=out_channels, eps=0.001, momentum=0.9, affine=True)

    def construct(self, x: Tensor) -> Tensor:
        out = self.bn_sep_1(self.separable_1(self.relu(x)))
        out = self.bn_sep_2(self.separable_2(self.relu1(out)))
        return out

mindcv.models.nasnet.BranchSeparablesReduction

Bases: BranchSeparables

NasNet model Residual Connections

Source code in mindcv\models\nasnet.py
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
class BranchSeparablesReduction(BranchSeparables):
    """BranchSeparables variant used in reduction cells: zero-pads the top and
    left edges before the first separable conv, then crops one row/column
    afterwards, producing a spatially shifted convolution."""

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int,
        padding: int,
        z_padding: int = 1,
        bias: bool = False,
    ) -> None:
        # idiomatic cooperative super() instead of the explicit
        # BranchSeparables.__init__(self, ...) call; behavior is identical
        super().__init__(in_channels, out_channels, kernel_size, stride, padding, bias)
        # pad z_padding rows/columns on the top and left edges only
        self.padding = nn.Pad(paddings=((0, 0), (0, 0), (z_padding, 0), (z_padding, 0)), mode="CONSTANT")

    def construct(self, x: Tensor) -> Tensor:
        x = self.relu(x)
        x = self.padding(x)
        x = self.separable_1(x)
        # drop the extra first row/column introduced by the asymmetric pad
        x = x[:, :, 1:, 1:]
        x = self.bn_sep_1(x)
        x = self.relu1(x)
        x = self.separable_2(x)
        x = self.bn_sep_2(x)
        return x

mindcv.models.nasnet.BranchSeparablesStem

Bases: Cell

NasNet model basic architecture

Source code in mindcv\models\nasnet.py
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
class BranchSeparablesStem(nn.Cell):
    """Stem variant of BranchSeparables: unlike the base block, the first
    separable conv already maps ``in_channels`` to ``out_channels``, so both
    BatchNorm stages operate on ``out_channels``."""

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int,
        padding: int,
        bias: bool = False,
    ) -> None:
        super().__init__()
        self.relu = nn.ReLU()
        # stage 1: channel-mapping separable conv, carries the stride
        self.separable_1 = SeparableConv2d(in_channels, out_channels, kernel_size, stride, padding, bias=bias)
        self.bn_sep_1 = nn.BatchNorm2d(num_features=out_channels, eps=0.001, momentum=0.9, affine=True)
        self.relu1 = nn.ReLU()
        # stage 2: stride-1 separable conv on out_channels
        self.separable_2 = SeparableConv2d(out_channels, out_channels, kernel_size, 1, padding, bias=bias)
        self.bn_sep_2 = nn.BatchNorm2d(num_features=out_channels, eps=0.001, momentum=0.9, affine=True)

    def construct(self, x: Tensor) -> Tensor:
        out = self.bn_sep_1(self.separable_1(self.relu(x)))
        out = self.bn_sep_2(self.separable_2(self.relu1(out)))
        return out

mindcv.models.nasnet.CellStem0

Bases: Cell

NasNet model basic architecture

Source code in mindcv\models\nasnet.py
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
class CellStem0(nn.Cell):
    """First NASNet stem cell: five branch pairs ("comb iters") of separable
    convs and pooling that downsample the stem features by 2x; four of the
    five branch results are concatenated on the channel axis."""

    def __init__(
        self,
        stem_filters: int,
        num_filters: int = 42,
    ) -> None:
        super().__init__()
        self.num_filters = num_filters
        self.stem_filters = stem_filters
        # 1x1 projection of the stem input: ReLU -> conv -> BN
        self.conv_1x1 = nn.SequentialCell([
            nn.ReLU(),
            nn.Conv2d(in_channels=self.stem_filters, out_channels=self.num_filters, kernel_size=1, stride=1,
                      pad_mode="pad", has_bias=False),
            nn.BatchNorm2d(num_features=self.num_filters, eps=0.001, momentum=0.9, affine=True)
        ])

        # "left" branches consume the 1x1-projected features; "right"
        # branches consume the raw stem input (hence the Stem variants)
        self.comb_iter_0_left = BranchSeparables(
            self.num_filters, self.num_filters, 5, 2, 2
        )
        self.comb_iter_0_right = BranchSeparablesStem(
            self.stem_filters, self.num_filters, 7, 2, 3, bias=False
        )

        self.comb_iter_1_left = nn.MaxPool2d(kernel_size=3, stride=2, pad_mode="same")
        self.comb_iter_1_right = BranchSeparablesStem(
            self.stem_filters, self.num_filters, 7, 2, 3, bias=False
        )

        self.comb_iter_2_left = nn.AvgPool2d(kernel_size=3, stride=2, pad_mode="same")
        self.comb_iter_2_right = BranchSeparablesStem(
            self.stem_filters, self.num_filters, 5, 2, 2, bias=False
        )

        self.comb_iter_3_right = nn.AvgPool2d(kernel_size=3, stride=1, pad_mode="same")

        self.comb_iter_4_left = BranchSeparables(
            self.num_filters, self.num_filters, 3, 1, 1, bias=False
        )
        self.comb_iter_4_right = nn.MaxPool2d(kernel_size=3, stride=2, pad_mode="same")

    def construct(self, x: Tensor) -> Tensor:
        # x1 is the 1x1-projected view of the stem input x
        x1 = self.conv_1x1(x)

        x_comb_iter_0_left = self.comb_iter_0_left(x1)
        x_comb_iter_0_right = self.comb_iter_0_right(x)
        x_comb_iter_0 = x_comb_iter_0_left + x_comb_iter_0_right

        x_comb_iter_1_left = self.comb_iter_1_left(x1)
        x_comb_iter_1_right = self.comb_iter_1_right(x)
        x_comb_iter_1 = x_comb_iter_1_left + x_comb_iter_1_right

        x_comb_iter_2_left = self.comb_iter_2_left(x1)
        x_comb_iter_2_right = self.comb_iter_2_right(x)
        x_comb_iter_2 = x_comb_iter_2_left + x_comb_iter_2_right

        # iter 0's result feeds iters 3 and 4 but is NOT concatenated itself
        x_comb_iter_3_right = self.comb_iter_3_right(x_comb_iter_0)
        x_comb_iter_3 = x_comb_iter_3_right + x_comb_iter_1

        x_comb_iter_4_left = self.comb_iter_4_left(x_comb_iter_0)
        x_comb_iter_4_right = self.comb_iter_4_right(x1)
        x_comb_iter_4 = x_comb_iter_4_left + x_comb_iter_4_right

        # output has 4 * num_filters channels
        x_out = ops.concat((x_comb_iter_1, x_comb_iter_2, x_comb_iter_3, x_comb_iter_4), axis=1)
        return x_out

mindcv.models.nasnet.CellStem1

Bases: Cell

NasNet model basic architecture

Source code in mindcv\models\nasnet.py
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
class CellStem1(nn.Cell):
    """Second NASNet stem cell: combines the output of CellStem0 (the "left"
    input, 1x1-projected) with a strided two-path skip built from the conv0
    features (the "right" input), then runs five branch pairs and
    concatenates four of their results."""

    def __init__(
        self,
        stem_filters: int,
        num_filters: int,
    ) -> None:
        super().__init__()
        self.num_filters = num_filters
        self.stem_filters = stem_filters
        # 1x1 projection of the CellStem0 output (which has 2*num_filters channels)
        self.conv_1x1 = nn.SequentialCell([
            nn.ReLU(),
            nn.Conv2d(in_channels=2 * self.num_filters, out_channels=self.num_filters, kernel_size=1, stride=1,
                      pad_mode="pad", has_bias=False),
            nn.BatchNorm2d(num_features=self.num_filters, eps=0.001, momentum=0.9, affine=True)])

        self.relu = nn.ReLU()
        # skip path 1: stride-2 avg-pool then 1x1 conv to half num_filters
        self.path_1 = nn.SequentialCell([
            nn.AvgPool2d(kernel_size=1, stride=2, pad_mode="valid"),
            nn.Conv2d(in_channels=self.stem_filters, out_channels=self.num_filters // 2, kernel_size=1, stride=1,
                      pad_mode="pad", has_bias=False)])

        # skip path 2: same as path 1 but shifted one pixel (pad bottom/right,
        # crop top/left in construct) before pooling
        self.path_2 = nn.CellList([])
        self.path_2.append(nn.Pad(paddings=((0, 0), (0, 0), (0, 1), (0, 1)), mode="CONSTANT"))
        self.path_2.append(
            nn.AvgPool2d(kernel_size=1, stride=2, pad_mode="valid")
        )
        self.path_2.append(
            nn.Conv2d(in_channels=self.stem_filters, out_channels=self.num_filters // 2, kernel_size=1, stride=1,
                      pad_mode="pad", has_bias=False)
        )

        # BN applied after concatenating the two half-width skip paths
        self.final_path_bn = nn.BatchNorm2d(num_features=self.num_filters, eps=0.001, momentum=0.9, affine=True)

        self.comb_iter_0_left = BranchSeparables(
            self.num_filters,
            self.num_filters,
            5,
            2,
            2,
            bias=False
        )
        self.comb_iter_0_right = BranchSeparables(
            self.num_filters,
            self.num_filters,
            7,
            2,
            3,
            bias=False
        )

        self.comb_iter_1_left = nn.MaxPool2d(3, stride=2, pad_mode="same")
        self.comb_iter_1_right = BranchSeparables(
            self.num_filters,
            self.num_filters,
            7,
            2,
            3,
            bias=False
        )

        self.comb_iter_2_left = nn.AvgPool2d(3, stride=2, pad_mode="same")
        self.comb_iter_2_right = BranchSeparables(
            self.num_filters,
            self.num_filters,
            5,
            2,
            2,
            bias=False
        )

        self.comb_iter_3_right = nn.AvgPool2d(kernel_size=3, stride=1, pad_mode="same")

        self.comb_iter_4_left = BranchSeparables(
            self.num_filters,
            self.num_filters,
            3,
            1,
            1,
            bias=False
        )
        self.comb_iter_4_right = nn.MaxPool2d(3, stride=2, pad_mode="same")

    def construct(self, x_conv0: Tensor, x_stem_0: Tensor) -> Tensor:
        # "left" stream: projected CellStem0 output
        x_left = self.conv_1x1(x_stem_0)
        x_relu = self.relu(x_conv0)
        # path 1
        x_path1 = self.path_1(x_relu)
        # path 2
        x_path2 = self.path_2[0](x_relu)
        x_path2 = x_path2[:, :, 1:, 1:]  # shift: drop first row/column after pad
        x_path2 = self.path_2[1](x_path2)
        x_path2 = self.path_2[2](x_path2)
        # final path
        x_right = self.final_path_bn(ops.concat((x_path1, x_path2), axis=1))

        x_comb_iter_0_left = self.comb_iter_0_left(x_left)
        x_comb_iter_0_right = self.comb_iter_0_right(x_right)
        x_comb_iter_0 = x_comb_iter_0_left + x_comb_iter_0_right

        x_comb_iter_1_left = self.comb_iter_1_left(x_left)
        x_comb_iter_1_right = self.comb_iter_1_right(x_right)
        x_comb_iter_1 = x_comb_iter_1_left + x_comb_iter_1_right

        x_comb_iter_2_left = self.comb_iter_2_left(x_left)
        x_comb_iter_2_right = self.comb_iter_2_right(x_right)
        x_comb_iter_2 = x_comb_iter_2_left + x_comb_iter_2_right

        # iter 0's result feeds iters 3 and 4 but is NOT concatenated itself
        x_comb_iter_3_right = self.comb_iter_3_right(x_comb_iter_0)
        x_comb_iter_3 = x_comb_iter_3_right + x_comb_iter_1

        x_comb_iter_4_left = self.comb_iter_4_left(x_comb_iter_0)
        x_comb_iter_4_right = self.comb_iter_4_right(x_left)
        x_comb_iter_4 = x_comb_iter_4_left + x_comb_iter_4_right

        # output has 4 * num_filters channels
        x_out = ops.concat((x_comb_iter_1, x_comb_iter_2, x_comb_iter_3, x_comb_iter_4), axis=1)
        return x_out

mindcv.models.nasnet.FirstCell

Bases: Cell

NasNet model basic architecture

Source code in mindcv\models\nasnet.py
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
class FirstCell(nn.Cell):
    """First normal cell after a resolution change: the previous-cell input
    ``x_prev`` is downsampled through two shifted avg-pool/conv skip paths
    (to match the current resolution), and six tensors are concatenated on
    the channel axis."""

    def __init__(
        self,
        in_channels_left: int,
        out_channels_left: int,
        in_channels_right: int,
        out_channels_right: int,
    ) -> None:
        super().__init__()
        # 1x1 projection of the current input x: ReLU -> conv -> BN
        self.conv_1x1 = nn.SequentialCell([
            nn.ReLU(),
            nn.Conv2d(in_channels=in_channels_right, out_channels=out_channels_right, kernel_size=1, stride=1,
                      pad_mode="pad", has_bias=False),
            nn.BatchNorm2d(num_features=out_channels_right, eps=0.001, momentum=0.9, affine=True)])

        self.relu = nn.ReLU()
        # skip path 1: stride-2 avg-pool then 1x1 conv on x_prev
        self.path_1 = nn.SequentialCell([
            nn.AvgPool2d(kernel_size=1, stride=2, pad_mode="valid"),
            nn.Conv2d(in_channels=in_channels_left, out_channels=out_channels_left, kernel_size=1, stride=1,
                      pad_mode="pad", has_bias=False)])

        # skip path 2: same but shifted one pixel (pad bottom/right, crop
        # top/left in construct) before pooling
        self.path_2 = nn.CellList([])
        self.path_2.append(nn.Pad(paddings=((0, 0), (0, 0), (0, 1), (0, 1)), mode="CONSTANT"))
        self.path_2.append(
            nn.AvgPool2d(kernel_size=1, stride=2, pad_mode="valid")
        )
        self.path_2.append(
            nn.Conv2d(in_channels=in_channels_left, out_channels=out_channels_left, kernel_size=1, stride=1,
                      pad_mode="pad", has_bias=False)
        )

        # BN over the concatenation of both skip paths (2 * out_channels_left)
        self.final_path_bn = nn.BatchNorm2d(num_features=out_channels_left * 2, eps=0.001, momentum=0.9, affine=True)

        self.comb_iter_0_left = BranchSeparables(
            out_channels_right, out_channels_right, 5, 1, 2, bias=False
        )
        self.comb_iter_0_right = BranchSeparables(
            out_channels_right, out_channels_right, 3, 1, 1, bias=False
        )

        self.comb_iter_1_left = BranchSeparables(
            out_channels_right, out_channels_right, 5, 1, 2, bias=False
        )
        self.comb_iter_1_right = BranchSeparables(
            out_channels_right, out_channels_right, 3, 1, 1, bias=False
        )

        self.comb_iter_2_left = nn.AvgPool2d(kernel_size=3, stride=1, pad_mode="same")

        self.comb_iter_3_left = nn.AvgPool2d(kernel_size=3, stride=1, pad_mode="same")
        self.comb_iter_3_right = nn.AvgPool2d(kernel_size=3, stride=1, pad_mode="same")

        self.comb_iter_4_left = BranchSeparables(
            out_channels_right, out_channels_right, 3, 1, 1, bias=False
        )

    def construct(self, x: Tensor, x_prev: Tensor) -> Tensor:
        # build the downsampled "left" stream from the previous cell's output
        x_relu = self.relu(x_prev)
        x_path1 = self.path_1(x_relu)
        x_path2 = self.path_2[0](x_relu)
        x_path2 = x_path2[:, :, 1:, 1:]  # shift: drop first row/column after pad
        x_path2 = self.path_2[1](x_path2)
        x_path2 = self.path_2[2](x_path2)
        # final path
        x_left = self.final_path_bn(ops.concat((x_path1, x_path2), axis=1))

        x_right = self.conv_1x1(x)

        x_comb_iter_0_left = self.comb_iter_0_left(x_right)
        x_comb_iter_0_right = self.comb_iter_0_right(x_left)
        x_comb_iter_0 = x_comb_iter_0_left + x_comb_iter_0_right

        x_comb_iter_1_left = self.comb_iter_1_left(x_left)
        x_comb_iter_1_right = self.comb_iter_1_right(x_left)
        x_comb_iter_1 = x_comb_iter_1_left + x_comb_iter_1_right

        x_comb_iter_2_left = self.comb_iter_2_left(x_right)
        x_comb_iter_2 = x_comb_iter_2_left + x_left

        x_comb_iter_3_left = self.comb_iter_3_left(x_left)
        x_comb_iter_3_right = self.comb_iter_3_right(x_left)
        x_comb_iter_3 = x_comb_iter_3_left + x_comb_iter_3_right

        x_comb_iter_4_left = self.comb_iter_4_left(x_right)
        x_comb_iter_4 = x_comb_iter_4_left + x_right

        # unlike the stem cells, x_left itself is part of the concatenation
        x_out = ops.concat((x_left, x_comb_iter_0, x_comb_iter_1, x_comb_iter_2, x_comb_iter_3, x_comb_iter_4), axis=1)
        return x_out

mindcv.models.nasnet.NASNetAMobile

Bases: Cell

NasNet model class, based on `"Learning Transferable Architectures for Scalable Image Recognition" <https://arxiv.org/pdf/1707.07012v4.pdf>`_

PARAMETER DESCRIPTION

num_classes: number of classification classes.

stem_filters: number of stem filters. Default: 32.

penultimate_filters: number of penultimate filters. Default: 1056.

filters_multiplier: size of filters multiplier. Default: 2.

Source code in mindcv\models\nasnet.py
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
class NASNetAMobile(nn.Cell):
    r"""NasNet model class, based on
    `"Learning Transferable Architectures for Scalable Image Recognition" <https://arxiv.org/pdf/1707.07012v4.pdf>`_
    Args:
        in_channels: number of input image channels. Default: 3.
        num_classes: number of classification classes. Default: 1000.
        stem_filters: number of stem filters. Default: 32.
        penultimate_filters: number of penultimate filters. Default: 1056.
        filters_multiplier: size of filters multiplier. Default: 2.
    """

    def __init__(
        self,
        in_channels: int = 3,
        num_classes: int = 1000,
        stem_filters: int = 32,
        penultimate_filters: int = 1056,
        filters_multiplier: int = 2,
    ) -> None:
        super().__init__()
        self.stem_filters = stem_filters
        self.penultimate_filters = penultimate_filters
        self.filters_multiplier = filters_multiplier

        filters = self.penultimate_filters // 24
        # 24 is default value for the architecture

        # stem: plain 3x3 stride-2 conv + BN (no padding)
        self.conv0 = nn.SequentialCell([
            nn.Conv2d(in_channels=in_channels, out_channels=self.stem_filters, kernel_size=3, stride=2, pad_mode="pad",
                      padding=0,
                      has_bias=False),
            nn.BatchNorm2d(num_features=self.stem_filters, eps=0.001, momentum=0.9, affine=True)
        ])

        # two reduction stem cells at progressively larger filter counts
        self.cell_stem_0 = CellStem0(
            self.stem_filters, num_filters=filters // (filters_multiplier ** 2)
        )
        self.cell_stem_1 = CellStem1(
            self.stem_filters, num_filters=filters // filters_multiplier
        )

        # stage 1: normal cells at the base filter count
        # (trailing "# a, b" comments give channels as multiples of `filters`)
        self.cell_0 = FirstCell(
            in_channels_left=filters,
            out_channels_left=filters // 2,  # 1, 0.5
            in_channels_right=2 * filters,
            out_channels_right=filters,
        )  # 2, 1
        self.cell_1 = NormalCell(
            in_channels_left=2 * filters,
            out_channels_left=filters,  # 2, 1
            in_channels_right=6 * filters,
            out_channels_right=filters,
        )  # 6, 1
        self.cell_2 = NormalCell(
            in_channels_left=6 * filters,
            out_channels_left=filters,  # 6, 1
            in_channels_right=6 * filters,
            out_channels_right=filters,
        )  # 6, 1
        self.cell_3 = NormalCell(
            in_channels_left=6 * filters,
            out_channels_left=filters,  # 6, 1
            in_channels_right=6 * filters,
            out_channels_right=filters,
        )  # 6, 1

        self.reduction_cell_0 = ReductionCell0(
            in_channels_left=6 * filters,
            out_channels_left=2 * filters,  # 6, 2
            in_channels_right=6 * filters,
            out_channels_right=2 * filters,
        )  # 6, 2

        # stage 2: normal cells at 2x the base filter count
        self.cell_6 = FirstCell(
            in_channels_left=6 * filters,
            out_channels_left=filters,  # 6, 1
            in_channels_right=8 * filters,
            out_channels_right=2 * filters,
        )  # 8, 2
        self.cell_7 = NormalCell(
            in_channels_left=8 * filters,
            out_channels_left=2 * filters,  # 8, 2
            in_channels_right=12 * filters,
            out_channels_right=2 * filters,
        )  # 12, 2
        self.cell_8 = NormalCell(
            in_channels_left=12 * filters,
            out_channels_left=2 * filters,  # 12, 2
            in_channels_right=12 * filters,
            out_channels_right=2 * filters,
        )  # 12, 2
        self.cell_9 = NormalCell(
            in_channels_left=12 * filters,
            out_channels_left=2 * filters,  # 12, 2
            in_channels_right=12 * filters,
            out_channels_right=2 * filters,
        )  # 12, 2

        self.reduction_cell_1 = ReductionCell1(
            in_channels_left=12 * filters,
            out_channels_left=4 * filters,  # 12, 4
            in_channels_right=12 * filters,
            out_channels_right=4 * filters,
        )  # 12, 4

        # stage 3: normal cells at 4x the base filter count
        self.cell_12 = FirstCell(
            in_channels_left=12 * filters,
            out_channels_left=2 * filters,  # 12, 2
            in_channels_right=16 * filters,
            out_channels_right=4 * filters,
        )  # 16, 4
        self.cell_13 = NormalCell(
            in_channels_left=16 * filters,
            out_channels_left=4 * filters,  # 16, 4
            in_channels_right=24 * filters,
            out_channels_right=4 * filters,
        )  # 24, 4
        self.cell_14 = NormalCell(
            in_channels_left=24 * filters,
            out_channels_left=4 * filters,  # 24, 4
            in_channels_right=24 * filters,
            out_channels_right=4 * filters,
        )  # 24, 4
        self.cell_15 = NormalCell(
            in_channels_left=24 * filters,
            out_channels_left=4 * filters,  # 24, 4
            in_channels_right=24 * filters,
            out_channels_right=4 * filters,
        )  # 24, 4

        # classification head: ReLU -> global pool -> dropout -> dense
        self.relu = nn.ReLU()
        self.dropout = Dropout(p=0.5)
        self.classifier = nn.Dense(in_channels=24 * filters, out_channels=num_classes)
        self.pool = GlobalAvgPooling()
        self._initialize_weights()

    def _initialize_weights(self):
        """Initialize weights for cells."""
        self.init_parameters_data()
        for _, cell in self.cells_and_names():
            if isinstance(cell, nn.Conv2d):
                # He-style normal init scaled by fan-out of the conv kernel
                n = cell.kernel_size[0] * cell.kernel_size[1] * cell.out_channels
                cell.weight.set_data(init.initializer(init.Normal(math.sqrt(2. / n), 0),
                                                      cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(init.initializer(init.Zero(), cell.bias.shape, cell.bias.dtype))
            elif isinstance(cell, nn.BatchNorm2d):
                cell.gamma.set_data(init.initializer(init.One(), cell.gamma.shape, cell.gamma.dtype))
                cell.beta.set_data(init.initializer(init.Zero(), cell.beta.shape, cell.beta.dtype))
            elif isinstance(cell, nn.Dense):
                cell.weight.set_data(init.initializer(init.Normal(0.01, 0), cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(init.initializer(init.Zero(), cell.bias.shape, cell.bias.dtype))

    def forward_features(self, x: Tensor) -> Tensor:
        """Network forward feature extraction."""
        # every cell takes (current input, previous cell's output)
        x_conv0 = self.conv0(x)
        x_stem_0 = self.cell_stem_0(x_conv0)
        x_stem_1 = self.cell_stem_1(x_conv0, x_stem_0)

        x_cell_0 = self.cell_0(x_stem_1, x_stem_0)
        x_cell_1 = self.cell_1(x_cell_0, x_stem_1)
        x_cell_2 = self.cell_2(x_cell_1, x_cell_0)
        x_cell_3 = self.cell_3(x_cell_2, x_cell_1)

        x_reduction_cell_0 = self.reduction_cell_0(x_cell_3, x_cell_2)

        x_cell_6 = self.cell_6(x_reduction_cell_0, x_cell_3)
        x_cell_7 = self.cell_7(x_cell_6, x_reduction_cell_0)
        x_cell_8 = self.cell_8(x_cell_7, x_cell_6)
        x_cell_9 = self.cell_9(x_cell_8, x_cell_7)

        x_reduction_cell_1 = self.reduction_cell_1(x_cell_9, x_cell_8)

        x_cell_12 = self.cell_12(x_reduction_cell_1, x_cell_9)
        x_cell_13 = self.cell_13(x_cell_12, x_reduction_cell_1)
        x_cell_14 = self.cell_14(x_cell_13, x_cell_12)
        x_cell_15 = self.cell_15(x_cell_14, x_cell_13)

        x_cell_15 = self.relu(x_cell_15)
        return x_cell_15

    def forward_head(self, x: Tensor) -> Tensor:
        """Classification head: pool -> dropout -> dense."""
        x = self.pool(x)  # global average pool
        x = self.dropout(x)
        x = self.classifier(x)
        return x

    def construct(self, x: Tensor) -> Tensor:
        x = self.forward_features(x)
        x = self.forward_head(x)
        return x
mindcv.models.nasnet.NASNetAMobile.forward_features(x)

Network forward feature extraction.

Source code in mindcv\models\nasnet.py
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
def forward_features(self, x: Tensor) -> Tensor:
    """Network forward feature extraction."""
    # every cell takes (current input, previous cell's output)
    x_conv0 = self.conv0(x)
    x_stem_0 = self.cell_stem_0(x_conv0)
    x_stem_1 = self.cell_stem_1(x_conv0, x_stem_0)

    x_cell_0 = self.cell_0(x_stem_1, x_stem_0)
    x_cell_1 = self.cell_1(x_cell_0, x_stem_1)
    x_cell_2 = self.cell_2(x_cell_1, x_cell_0)
    x_cell_3 = self.cell_3(x_cell_2, x_cell_1)

    x_reduction_cell_0 = self.reduction_cell_0(x_cell_3, x_cell_2)

    x_cell_6 = self.cell_6(x_reduction_cell_0, x_cell_3)
    x_cell_7 = self.cell_7(x_cell_6, x_reduction_cell_0)
    x_cell_8 = self.cell_8(x_cell_7, x_cell_6)
    x_cell_9 = self.cell_9(x_cell_8, x_cell_7)

    x_reduction_cell_1 = self.reduction_cell_1(x_cell_9, x_cell_8)

    x_cell_12 = self.cell_12(x_reduction_cell_1, x_cell_9)
    x_cell_13 = self.cell_13(x_cell_12, x_reduction_cell_1)
    x_cell_14 = self.cell_14(x_cell_13, x_cell_12)
    x_cell_15 = self.cell_15(x_cell_14, x_cell_13)

    # final activation before the classification head
    x_cell_15 = self.relu(x_cell_15)
    return x_cell_15

mindcv.models.nasnet.NormalCell

Bases: Cell

NasNet model basic architecture

Source code in mindcv\models\nasnet.py
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
class NormalCell(nn.Cell):
    """NASNet-A normal cell (stride 1; spatial resolution is preserved).

    The two incoming feature maps -- the current input ``x`` and the older
    ``x_prev`` -- are each squeezed by ReLU -> 1x1 conv -> BN, combined by
    five branch pairs (separable convs and 3x3 average pools), and the
    squeezed left input plus the five combination results are concatenated
    along the channel axis.

    Args:
        in_channels_left: Channels of ``x_prev``.
        out_channels_left: Channels after the 1x1 squeeze of ``x_prev``.
        in_channels_right: Channels of ``x``.
        out_channels_right: Channels after the 1x1 squeeze of ``x``.
    """
    def __init__(self,
                 in_channels_left: int,
                 out_channels_left: int,
                 in_channels_right: int,
                 out_channels_right: int) -> None:
        super().__init__()
        # Squeeze the older input (x_prev): ReLU -> 1x1 conv -> BN.
        self.conv_prev_1x1 = nn.SequentialCell([
            nn.ReLU(),
            nn.Conv2d(in_channels=in_channels_left, out_channels=out_channels_left, kernel_size=1, stride=1,
                      pad_mode="pad", has_bias=False),
            nn.BatchNorm2d(num_features=out_channels_left, eps=0.001, momentum=0.9, affine=True)])

        # Squeeze the current input (x): ReLU -> 1x1 conv -> BN.
        self.conv_1x1 = nn.SequentialCell([
            nn.ReLU(),
            nn.Conv2d(in_channels=in_channels_right, out_channels=out_channels_right, kernel_size=1, stride=1,
                      pad_mode="pad", has_bias=False),
            nn.BatchNorm2d(num_features=out_channels_right, eps=0.001, momentum=0.9, affine=True)])

        # Branch pair 0: 5x5 separable on the right input + 3x3 separable on the left.
        self.comb_iter_0_left = BranchSeparables(
            out_channels_right, out_channels_right, 5, 1, 2, bias=False
        )
        self.comb_iter_0_right = BranchSeparables(
            out_channels_left, out_channels_left, 3, 1, 1, bias=False
        )

        # Branch pair 1: 5x5 + 3x3 separables, both on the left input.
        self.comb_iter_1_left = BranchSeparables(
            out_channels_left, out_channels_left, 5, 1, 2, bias=False
        )
        self.comb_iter_1_right = BranchSeparables(
            out_channels_left, out_channels_left, 3, 1, 1, bias=False
        )

        # Branch 2: average pool of the right input (summed with the raw left input).
        self.comb_iter_2_left = nn.AvgPool2d(kernel_size=3, stride=1, pad_mode="same")

        # Branch pair 3: two average pools of the left input.
        self.comb_iter_3_left = nn.AvgPool2d(kernel_size=3, stride=1, pad_mode="same")
        self.comb_iter_3_right = nn.AvgPool2d(kernel_size=3, stride=1, pad_mode="same")

        # Branch 4: 3x3 separable of the right input (summed with the raw right input).
        self.comb_iter_4_left = BranchSeparables(
            out_channels_right, out_channels_right, 3, 1, 1, bias=False
        )

    def construct(self, x: Tensor, x_prev: Tensor) -> Tensor:
        x_left = self.conv_prev_1x1(x_prev)
        x_right = self.conv_1x1(x)

        # Five combination steps; each sums a pair of branch outputs.
        x_comb_iter_0_left = self.comb_iter_0_left(x_right)
        x_comb_iter_0_right = self.comb_iter_0_right(x_left)
        x_comb_iter_0 = x_comb_iter_0_left + x_comb_iter_0_right

        x_comb_iter_1_left = self.comb_iter_1_left(x_left)
        x_comb_iter_1_right = self.comb_iter_1_right(x_left)
        x_comb_iter_1 = x_comb_iter_1_left + x_comb_iter_1_right

        x_comb_iter_2_left = self.comb_iter_2_left(x_right)
        x_comb_iter_2 = x_comb_iter_2_left + x_left

        x_comb_iter_3_left = self.comb_iter_3_left(x_left)
        x_comb_iter_3_right = self.comb_iter_3_right(x_left)
        x_comb_iter_3 = x_comb_iter_3_left + x_comb_iter_3_right

        x_comb_iter_4_left = self.comb_iter_4_left(x_right)
        x_comb_iter_4 = x_comb_iter_4_left + x_right

        # Concatenate the squeezed left input with the five combination results.
        x_out = ops.concat((x_left, x_comb_iter_0, x_comb_iter_1, x_comb_iter_2, x_comb_iter_3, x_comb_iter_4), axis=1)
        return x_out

mindcv.models.nasnet.ReductionCell0

Bases: Cell

NasNet model Residual Connections

Source code in mindcv\models\nasnet.py
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
class ReductionCell0(nn.Cell):
    """First NASNet-A reduction cell: halves the spatial resolution.

    Both inputs are squeezed by ReLU -> 1x1 conv -> BN, then combined by five
    branch pairs built from stride-2 separable convs (the ``Reduction``
    variant) and stride-2 pools. Four of the five combination results are
    concatenated along the channel axis; combination 0 only feeds
    combinations 3 and 4.

    Args:
        in_channels_left: Channels of ``x_prev``.
        out_channels_left: Channels after the 1x1 squeeze of ``x_prev``.
        in_channels_right: Channels of ``x``.
        out_channels_right: Channels after the 1x1 squeeze of ``x``.
    """

    def __init__(
        self,
        in_channels_left: int,
        out_channels_left: int,
        in_channels_right: int,
        out_channels_right: int,
    ) -> None:
        super().__init__()
        # Squeeze the older input (x_prev): ReLU -> 1x1 conv -> BN.
        self.conv_prev_1x1 = nn.SequentialCell([
            nn.ReLU(),
            nn.Conv2d(in_channels=in_channels_left, out_channels=out_channels_left, kernel_size=1, stride=1,
                      pad_mode="pad", has_bias=False),
            nn.BatchNorm2d(num_features=out_channels_left, eps=0.001, momentum=0.9, affine=True)])

        # Squeeze the current input (x): ReLU -> 1x1 conv -> BN.
        self.conv_1x1 = nn.SequentialCell([
            nn.ReLU(),
            nn.Conv2d(in_channels=in_channels_right, out_channels=out_channels_right, kernel_size=1, stride=1,
                      pad_mode="pad", has_bias=False),
            nn.BatchNorm2d(num_features=out_channels_right, eps=0.001, momentum=0.9, affine=True)])

        # Branch pair 0: stride-2 5x5 separable (right) + stride-2 7x7 separable (left).
        self.comb_iter_0_left = BranchSeparablesReduction(
            out_channels_right, out_channels_right, 5, 2, 2, bias=False
        )
        self.comb_iter_0_right = BranchSeparablesReduction(
            out_channels_right, out_channels_right, 7, 2, 3, bias=False
        )

        # Branch pair 1: stride-2 max pool (right) + stride-2 7x7 separable (left).
        self.comb_iter_1_left = nn.MaxPool2d(3, stride=2, pad_mode="same")
        self.comb_iter_1_right = BranchSeparablesReduction(
            out_channels_right, out_channels_right, 7, 2, 3, bias=False
        )

        # Branch pair 2: stride-2 avg pool (right) + stride-2 5x5 separable (left).
        self.comb_iter_2_left = nn.AvgPool2d(3, stride=2, pad_mode="same")
        self.comb_iter_2_right = BranchSeparablesReduction(
            out_channels_right, out_channels_right, 5, 2, 2, bias=False
        )

        # Branch 3: stride-1 avg pool applied to the output of branch pair 0.
        self.comb_iter_3_right = nn.AvgPool2d(kernel_size=3, stride=1, pad_mode="same")

        # Branch pair 4: stride-1 3x3 separable on branch 0's output + stride-2 max pool.
        self.comb_iter_4_left = BranchSeparablesReduction(
            out_channels_right, out_channels_right, 3, 1, 1, bias=False
        )
        self.comb_iter_4_right = nn.MaxPool2d(3, stride=2, pad_mode="same")

    def construct(self, x: Tensor, x_prev: Tensor) -> Tensor:
        x_left = self.conv_prev_1x1(x_prev)
        x_right = self.conv_1x1(x)

        x_comb_iter_0_left = self.comb_iter_0_left(x_right)
        x_comb_iter_0_right = self.comb_iter_0_right(x_left)
        x_comb_iter_0 = x_comb_iter_0_left + x_comb_iter_0_right

        x_comb_iter_1_left = self.comb_iter_1_left(x_right)
        x_comb_iter_1_right = self.comb_iter_1_right(x_left)
        x_comb_iter_1 = x_comb_iter_1_left + x_comb_iter_1_right

        x_comb_iter_2_left = self.comb_iter_2_left(x_right)
        x_comb_iter_2_right = self.comb_iter_2_right(x_left)
        x_comb_iter_2 = x_comb_iter_2_left + x_comb_iter_2_right

        x_comb_iter_3_right = self.comb_iter_3_right(x_comb_iter_0)
        x_comb_iter_3 = x_comb_iter_3_right + x_comb_iter_1

        x_comb_iter_4_left = self.comb_iter_4_left(x_comb_iter_0)
        x_comb_iter_4_right = self.comb_iter_4_right(x_right)
        x_comb_iter_4 = x_comb_iter_4_left + x_comb_iter_4_right

        # x_comb_iter_0 feeds branches 3 and 4 but is not concatenated itself.
        x_out = ops.concat((x_comb_iter_1, x_comb_iter_2, x_comb_iter_3, x_comb_iter_4), axis=1)
        return x_out

mindcv.models.nasnet.ReductionCell1

Bases: Cell

NasNet model Residual Connections

Source code in mindcv\models\nasnet.py
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
class ReductionCell1(nn.Cell):
    """Second NASNet-A reduction cell: halves the spatial resolution.

    Same topology as ``ReductionCell0`` but the separable branches use the
    plain ``BranchSeparables`` variant instead of
    ``BranchSeparablesReduction``. Four of the five combination results are
    concatenated along the channel axis; combination 0 only feeds
    combinations 3 and 4.

    Args:
        in_channels_left: Channels of ``x_prev``.
        out_channels_left: Channels after the 1x1 squeeze of ``x_prev``.
        in_channels_right: Channels of ``x``.
        out_channels_right: Channels after the 1x1 squeeze of ``x``.
    """

    def __init__(
        self,
        in_channels_left: int,
        out_channels_left: int,
        in_channels_right: int,
        out_channels_right: int,
    ) -> None:
        super().__init__()
        # Squeeze the older input (x_prev): ReLU -> 1x1 conv -> BN.
        self.conv_prev_1x1 = nn.SequentialCell([
            nn.ReLU(),
            nn.Conv2d(in_channels=in_channels_left, out_channels=out_channels_left, kernel_size=1, stride=1,
                      pad_mode="pad", has_bias=False),
            nn.BatchNorm2d(num_features=out_channels_left, eps=0.001, momentum=0.9, affine=True)])

        # Squeeze the current input (x): ReLU -> 1x1 conv -> BN.
        self.conv_1x1 = nn.SequentialCell([
            nn.ReLU(),
            nn.Conv2d(in_channels=in_channels_right, out_channels=out_channels_right, kernel_size=1, stride=1,
                      pad_mode="pad", has_bias=False),
            nn.BatchNorm2d(num_features=out_channels_right, eps=0.001, momentum=0.9, affine=True)])

        # Branch pair 0: stride-2 5x5 separable (right) + stride-2 7x7 separable (left).
        self.comb_iter_0_left = BranchSeparables(
            out_channels_right,
            out_channels_right,
            5,
            2,
            2,
            bias=False
        )
        self.comb_iter_0_right = BranchSeparables(
            out_channels_right,
            out_channels_right,
            7,
            2,
            3,
            bias=False
        )

        # Branch pair 1: stride-2 max pool (right) + stride-2 7x7 separable (left).
        self.comb_iter_1_left = nn.MaxPool2d(3, stride=2, pad_mode="same")
        self.comb_iter_1_right = BranchSeparables(
            out_channels_right,
            out_channels_right,
            7,
            2,
            3,
            bias=False
        )

        # Branch pair 2: stride-2 avg pool (right) + stride-2 5x5 separable (left).
        self.comb_iter_2_left = nn.AvgPool2d(3, stride=2, pad_mode="same")
        self.comb_iter_2_right = BranchSeparables(
            out_channels_right,
            out_channels_right,
            5,
            2,
            2,
            bias=False
        )

        # Branch 3: stride-1 avg pool applied to the output of branch pair 0.
        self.comb_iter_3_right = nn.AvgPool2d(kernel_size=3, stride=1, pad_mode="same")

        # Branch pair 4: stride-1 3x3 separable on branch 0's output + stride-2 max pool.
        self.comb_iter_4_left = BranchSeparables(
            out_channels_right,
            out_channels_right,
            3,
            1,
            1,
            bias=False
        )
        self.comb_iter_4_right = nn.MaxPool2d(3, stride=2, pad_mode="same")

    def construct(self, x: Tensor, x_prev: Tensor) -> Tensor:
        x_left = self.conv_prev_1x1(x_prev)
        x_right = self.conv_1x1(x)

        x_comb_iter_0_left = self.comb_iter_0_left(x_right)
        x_comb_iter_0_right = self.comb_iter_0_right(x_left)
        x_comb_iter_0 = x_comb_iter_0_left + x_comb_iter_0_right

        x_comb_iter_1_left = self.comb_iter_1_left(x_right)
        x_comb_iter_1_right = self.comb_iter_1_right(x_left)
        x_comb_iter_1 = x_comb_iter_1_left + x_comb_iter_1_right

        x_comb_iter_2_left = self.comb_iter_2_left(x_right)
        x_comb_iter_2_right = self.comb_iter_2_right(x_left)
        x_comb_iter_2 = x_comb_iter_2_left + x_comb_iter_2_right

        x_comb_iter_3_right = self.comb_iter_3_right(x_comb_iter_0)
        x_comb_iter_3 = x_comb_iter_3_right + x_comb_iter_1

        x_comb_iter_4_left = self.comb_iter_4_left(x_comb_iter_0)
        x_comb_iter_4_right = self.comb_iter_4_right(x_right)
        x_comb_iter_4 = x_comb_iter_4_left + x_comb_iter_4_right

        # x_comb_iter_0 feeds branches 3 and 4 but is not concatenated itself.
        x_out = ops.concat((x_comb_iter_1, x_comb_iter_2, x_comb_iter_3, x_comb_iter_4), axis=1)
        return x_out

mindcv.models.nasnet.SeparableConv2d

Bases: Cell

depth-wise convolutions + point-wise convolutions

Source code in mindcv\models\nasnet.py
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
class SeparableConv2d(nn.Cell):
    """Separable convolution: a depthwise conv followed by a 1x1 pointwise conv."""

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        dw_kernel: int,
        dw_stride: int,
        dw_padding: int,
        bias: bool = False,
    ) -> None:
        super().__init__()
        # Depthwise step: one filter per input channel (group == in_channels).
        self.depthwise_conv2d = nn.Conv2d(
            in_channels=in_channels,
            out_channels=in_channels,
            kernel_size=dw_kernel,
            stride=dw_stride,
            pad_mode="pad",
            padding=dw_padding,
            group=in_channels,
            has_bias=bias,
        )
        # Pointwise step: 1x1 conv mixing channels to the requested width.
        self.pointwise_conv2d = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
            pad_mode="pad",
            has_bias=bias,
        )

    def construct(self, x: Tensor) -> Tensor:
        return self.pointwise_conv2d(self.depthwise_conv2d(x))

mindcv.models.nasnet.nasnet_a_4x1056(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get NasNet model. Refer to the base class models.NASNetAMobile for more details.

Source code in mindcv\models\nasnet.py
874
875
876
877
878
879
880
881
882
@register_model
def nasnet_a_4x1056(pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs) -> NASNetAMobile:
    """Get NasNet model.
    Refer to the base class `models.NASNetAMobile` for more details."""
    cfg = default_cfgs["nasnet_a_4x1056"]
    model = NASNetAMobile(in_channels=in_channels, num_classes=num_classes, **kwargs)
    if pretrained:
        load_pretrained(model, cfg, num_classes=num_classes, in_channels=in_channels)
    return model

pit

mindcv.models.pit

MindSpore implementation of PiT. Refer to Rethinking Spatial Dimensions of Vision Transformers.

mindcv.models.pit.Attention

Bases: Cell

define multi-head self attention block

Source code in mindcv\models\pit.py
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
class Attention(nn.Cell):
    """Multi-head self-attention with separate q/k/v projections.

    Args:
        dim: Token embedding dimension.
        num_heads: Number of attention heads. Default: 8.
        qkv_bias: Whether the q/k/v Dense layers carry a bias. Default: False.
        attn_drop: Dropout rate on the attention weights. Default: 0.0.
        proj_drop: Dropout rate after the output projection. Default: 0.0.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
    ) -> None:
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim**-0.5  # 1/sqrt(head_dim), applied to q in construct
        # Separate projections for query, key and value (instead of a fused qkv layer).
        self.q = nn.Dense(in_channels=dim, out_channels=dim, has_bias=qkv_bias)
        self.k = nn.Dense(in_channels=dim, out_channels=dim, has_bias=qkv_bias)
        self.v = nn.Dense(in_channels=dim, out_channels=dim, has_bias=qkv_bias)
        self.attn_drop = Dropout(p=attn_drop)
        self.proj = nn.Dense(dim, dim)
        self.proj_drop = Dropout(p=proj_drop)
        self.softmax = nn.Softmax(axis=-1)

        self.batchmatmul = ops.BatchMatMul()

    def construct(self, x):
        B, N, C = x.shape
        # q is pre-scaled; heads are split out to (B, heads, N, head_dim).
        q = ops.reshape(self.q(x), (B, N, self.num_heads, C // self.num_heads)) * self.scale
        q = ops.transpose(q, (0, 2, 1, 3))
        # k is transposed to (B, heads, head_dim, N) so that q @ k yields (B, heads, N, N).
        k = ops.reshape(self.k(x), (B, N, self.num_heads, C // self.num_heads))
        k = ops.transpose(k, (0, 2, 3, 1))
        v = ops.reshape(self.v(x), (B, N, self.num_heads, C // self.num_heads))
        v = ops.transpose(v, (0, 2, 1, 3))

        attn = self.batchmatmul(q, k)
        attn = self.softmax(attn)
        attn = self.attn_drop(attn)

        # Weighted sum of values, then merge heads back to (B, N, C).
        x = self.batchmatmul(attn, v)
        x = ops.reshape(ops.transpose(x, (0, 2, 1, 3)), (B, N, C))
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

mindcv.models.pit.Block

Bases: Cell

define the basic block of PiT

Source code in mindcv\models\pit.py
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
class Block(nn.Cell):
    """Transformer encoder block of PiT: pre-norm attention and MLP, each
    wrapped in a residual connection with optional stochastic depth.

    Args:
        dim: Embedding dimension.
        num_heads: Number of attention heads.
        mlp_ratio: Hidden/input width ratio of the MLP. Default: 4.0.
        qkv_bias: Whether q/k/v projections carry a bias. Default: False.
        drop: Dropout rate in the MLP and after the attention projection. Default: 0.0.
        attn_drop: Dropout rate on the attention weights. Default: 0.0.
        drop_path: Stochastic-depth rate for both residual branches. Default: 0.0.
        act_layer: Activation class used inside the MLP. Default: nn.GELU.
        norm_layer: Normalization class. Default: nn.LayerNorm.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = False,
        drop: float = 0.0,
        attn_drop: float = 0.0,
        drop_path: float = 0.0,
        act_layer: nn.Cell = nn.GELU,
        norm_layer: nn.Cell = nn.LayerNorm,
    ) -> None:
        super().__init__()
        self.norm1 = norm_layer((dim,), epsilon=1e-6)
        self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
        # Identity when drop_path == 0 (no stochastic depth).
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity()
        self.norm2 = norm_layer((dim,), epsilon=1e-6)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)

    def construct(self, x):
        # Pre-norm residual branches: x + DropPath(Attn(LN(x))), then the MLP branch.
        x = x + self.drop_path(self.attn(self.norm1(x)))
        x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x

mindcv.models.pit.Mlp

Bases: Cell

MLP as used in Vision Transformer, MLP-Mixer and related networks

Source code in mindcv\models\pit.py
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
class Mlp(nn.Cell):
    """MLP as used in Vision Transformer, MLP-Mixer and related networks.

    Two Dense layers with an activation in between; the same dropout is
    applied after the activation and after the second Dense layer.

    Args:
        in_features: Input feature dimension.
        hidden_features: Hidden dimension; defaults to ``in_features``.
        out_features: Output dimension; defaults to ``in_features``.
        act_layer: Activation class. Default: nn.GELU.
        drop: Dropout rate. Default: 0.0.
    """

    def __init__(
        self,
        in_features: int,
        hidden_features: int = None,
        out_features: int = None,
        act_layer: nn.Cell = nn.GELU,
        drop: float = 0.0,
    ) -> None:
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Dense(in_channels=in_features, out_channels=hidden_features, has_bias=True)
        self.act = act_layer()
        self.fc2 = nn.Dense(in_channels=hidden_features, out_channels=out_features, has_bias=True)
        self.drop = Dropout(p=drop)

    def construct(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x

mindcv.models.pit.PoolingTransformer

Bases: Cell

PiT model class, based on "Rethinking Spatial Dimensions of Vision Transformers" <https://arxiv.org/abs/2103.16302> Args: image_size (int) : images input size. patch_size (int) : image patch size. stride (int) : stride of the depthwise conv. base_dims (List[int]) : middle dim of each layer. depth (List[int]) : model block depth of each layer. heads (List[int]) : number of heads of multi-head attention of each layer. mlp_ratio (float) : ratio of hidden features in Mlp. num_classes (int) : number of classification classes. Default: 1000. in_chans (int) : number of channels of the input. Default: 3. attn_drop_rate (float) : attention layers dropout rate. Default: 0. drop_rate (float) : dropout rate. Default: 0. drop_path_rate (float) : drop path rate. Default: 0.

Source code in mindcv\models\pit.py
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
class PoolingTransformer(nn.Cell):
    r"""PiT model class, based on
    `"Rethinking Spatial Dimensions of Vision Transformers"
    <https://arxiv.org/abs/2103.16302>`
    Args:
        image_size (int) : images input size.
        patch_size (int) : image patch size.
        stride (int) : stride of the patch-embedding conv.
        base_dims (List[int]) : per-head dim of each stage.
        depth (List[int]) : model block depth of each stage.
        heads (List[int]) : number of heads of multi-head attention of each stage.
        mlp_ratio (float) : ratio of hidden features in Mlp.
        num_classes (int) : number of classification classes. Default: 1000.
        in_chans (int) : number of channels of the input. Default: 3.
        attn_drop_rate (float) : attention layers dropout rate. Default: 0.
        drop_rate (float) : dropout rate. Default: 0.
        drop_path_rate (float) : drop path rate. Default: 0.
    """

    def __init__(
        self,
        image_size: int,
        patch_size: int,
        stride: int,
        base_dims: List[int],
        depth: List[int],
        heads: List[int],
        mlp_ratio: float,
        num_classes: int = 1000,
        in_chans: int = 3,
        attn_drop_rate: float = 0.0,
        drop_rate: float = 0.0,
        drop_path_rate: float = 0.0,
    ) -> None:
        super().__init__()

        total_block = sum(depth)
        padding = 0
        block_idx = 0

        # Spatial size of the token grid produced by the patch-embedding conv.
        width = math.floor((image_size + 2 * padding - patch_size) / stride + 1)

        self.base_dims = base_dims
        self.heads = heads
        self.num_classes = num_classes

        self.patch_size = patch_size
        # Learnable 2D positional embedding for the first stage's token grid.
        self.pos_embed = Parameter(Tensor(np.random.randn(1, base_dims[0] * heads[0], width, width), mstype.float32))
        self.patch_embed = conv_embedding(in_chans, base_dims[0] * heads[0], patch_size, stride, padding)
        self.cls_token = Parameter(Tensor(np.random.randn(1, 1, base_dims[0] * heads[0]), mstype.float32))

        self.pos_drop = Dropout(p=drop_rate)
        self.tile = ops.Tile()

        self.transformers = nn.CellList([])
        self.pools = nn.CellList([])

        for stage in range(len(depth)):
            # Stochastic-depth rates increase linearly over all blocks of the model.
            drop_path_prob = [drop_path_rate * i / total_block for i in range(block_idx, block_idx + depth[stage])]
            block_idx += depth[stage]
            self.transformers.append(
                Transformer(
                    base_dims[stage], depth[stage], heads[stage], mlp_ratio, drop_rate, attn_drop_rate, drop_path_prob
                )
            )
            # A conv-based pooling layer between consecutive stages (none after the last).
            if stage < len(heads) - 1:
                self.pools.append(
                    conv_head_pooling(
                        base_dims[stage] * heads[stage], base_dims[stage + 1] * heads[stage + 1], stride=2
                    )
                )

        self.norm = nn.LayerNorm((base_dims[-1] * heads[-1],), epsilon=1e-6)

        self.embed_dim = base_dims[-1] * heads[-1]

        # Classifier head
        if num_classes > 0:
            self.head = nn.Dense(in_channels=base_dims[-1] * heads[-1], out_channels=num_classes, has_bias=True)
        else:
            self.head = Identity()

        # Re-initialize positional embedding and class token with truncated normal.
        self.pos_embed.set_data(
            init.initializer(init.TruncatedNormal(sigma=0.02), self.pos_embed.shape, self.pos_embed.dtype)
        )
        self.cls_token.set_data(
            init.initializer(init.TruncatedNormal(sigma=0.02), self.cls_token.shape, self.cls_token.dtype)
        )
        self._initialize_weights()

    def _initialize_weights(self) -> None:
        """Initialize LayerNorm, Conv2d and Dense parameters across the network."""
        for _, cell in self.cells_and_names():
            if isinstance(cell, nn.LayerNorm):
                cell.gamma.set_data(init.initializer(init.One(), cell.gamma.shape, cell.gamma.dtype))
                cell.beta.set_data(init.initializer(init.Zero(), cell.beta.shape, cell.beta.dtype))
            if isinstance(cell, nn.Conv2d):
                # Uniform(+-sqrt(1/n)) with n = kh * kw * in_channels (fan-in).
                n = cell.kernel_size[0] * cell.kernel_size[1] * cell.in_channels
                cell.weight.set_data(
                    init.initializer(init.Uniform(math.sqrt(1.0 / n)), cell.weight.shape, cell.weight.dtype)
                )
                if cell.bias is not None:
                    cell.bias.set_data(
                        init.initializer(init.Uniform(math.sqrt(1.0 / n)), cell.bias.shape, cell.bias.dtype)
                    )
            if isinstance(cell, nn.Dense):
                # NOTE(review): range is derived from weight.shape[0]; confirm this
                # is the intended fan dimension for Dense layers here.
                init_range = 1.0 / np.sqrt(cell.weight.shape[0])
                cell.weight.set_data(init.initializer(init.Uniform(init_range), cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(init.initializer(init.Uniform(init_range), cell.bias.shape, cell.bias.dtype))

    def forward_features(self, x: Tensor) -> Tensor:
        """Embed patches, run all transformer stages, return normalized cls tokens."""
        x = self.patch_embed(x)

        pos_embed = self.pos_embed
        x = self.pos_drop(x + pos_embed)

        # Broadcast the class token across the batch.
        cls_tokens = self.tile(self.cls_token, (x.shape[0], 1, 1))

        # Alternate transformer stage -> pooling; the last stage has no pooling.
        for stage in range(len(self.pools)):
            x, cls_tokens = self.transformers[stage](x, cls_tokens)
            x, cls_tokens = self.pools[stage](x, cls_tokens)
        x, cls_tokens = self.transformers[-1](x, cls_tokens)

        cls_tokens = self.norm(cls_tokens)

        return cls_tokens

    def forward_head(self, x: Tensor) -> Tensor:
        """Classify from the first (class) token."""
        cls_token = self.head(x[:, 0])
        return cls_token

    def construct(self, x: Tensor) -> Tensor:
        cls_token = self.forward_features(x)
        cls_token = self.forward_head(cls_token)
        return cls_token

mindcv.models.pit.Transformer

Bases: Cell

define the transformer block of PiT

Source code in mindcv\models\pit.py
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
class Transformer(nn.Cell):
    """One stage of PiT: a stack of transformer ``Block``s applied to the
    flattened spatial tokens concatenated with the class tokens.

    Args:
        base_dim: Per-head embedding dimension of this stage.
        depth: Number of Blocks in this stage.
        heads: Number of attention heads (embed_dim = base_dim * heads).
        mlp_ratio: Hidden/input width ratio of each Block's MLP.
        drop_rate: Dropout rate. Default: 0.0.
        attn_drop_rate: Attention dropout rate. Default: 0.0.
        drop_path_prob: Per-block stochastic-depth rates; all zeros when None.
    """

    def __init__(
        self,
        base_dim: int,
        depth: int,
        heads: int,
        mlp_ratio: float,
        drop_rate: float = 0.0,
        attn_drop_rate: float = 0.0,
        drop_path_prob: List[float] = None,
    ) -> None:
        super().__init__()
        self.layers = nn.CellList([])  # NOTE(review): appears unused; blocks live in self.blocks
        embed_dim = base_dim * heads

        if drop_path_prob is None:
            drop_path_prob = [0.0 for _ in range(depth)]

        self.blocks = nn.CellList(
            [
                Block(
                    dim=embed_dim,
                    num_heads=heads,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=True,
                    drop=drop_rate,
                    attn_drop=attn_drop_rate,
                    drop_path=drop_path_prob[i],
                    norm_layer=nn.LayerNorm,
                )
                for i in range(depth)
            ]
        )

    def construct(self, x, cls_tokens):
        # Flatten (B, C, H, W) -> (B, H*W, C) and prepend the class tokens.
        h, w = x.shape[2:4]
        x = ops.reshape(x, (x.shape[0], x.shape[1], h * w))
        x = ops.transpose(x, (0, 2, 1))
        token_length = cls_tokens.shape[1]
        x = ops.concat((cls_tokens, x), axis=1)
        for blk in self.blocks:
            x = blk(x)

        # Split the class tokens back off and restore the (B, C, H, W) layout.
        cls_tokens = x[:, :token_length]
        x = x[:, token_length:]
        x = ops.transpose(x, (0, 2, 1))
        x = ops.reshape(x, (x.shape[0], x.shape[1], h, w))
        return x, cls_tokens

mindcv.models.pit.conv_embedding

Bases: Cell

define embedding layer using conv2d

Source code in mindcv\models\pit.py
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
class conv_embedding(nn.Cell):
    """Patch-embedding layer implemented as a single strided conv2d."""

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        patch_size: int,
        stride: int,
        padding: int,
    ) -> None:
        super().__init__()
        # One conv extracts and projects all patches at once.
        self.conv = nn.Conv2d(
            in_channels, out_channels,
            kernel_size=patch_size, stride=stride,
            pad_mode="pad", padding=padding, has_bias=True,
        )

    def construct(self, x: Tensor) -> Tensor:
        return self.conv(x)

mindcv.models.pit.conv_head_pooling

Bases: Cell

define pooling layer using conv in spatial tokens with an additional fully-connected layer (to adjust the channel size to match the spatial tokens)

Source code in mindcv\models\pit.py
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
class conv_head_pooling(nn.Cell):
    """Downsample spatial tokens with a depthwise strided conv, and project the
    class token with a Dense layer so its channel size matches the new stage."""

    def __init__(
        self,
        in_feature: int,
        out_feature: int,
        stride: int,
        pad_mode: str = "pad",
    ) -> None:
        super().__init__()
        # Depthwise (group == in_feature) conv with kernel stride+1 and padding stride//2.
        self.conv = nn.Conv2d(
            in_feature, out_feature,
            kernel_size=stride + 1, stride=stride, padding=stride // 2,
            pad_mode=pad_mode, group=in_feature, has_bias=True,
        )
        self.fc = nn.Dense(in_channels=in_feature, out_channels=out_feature, has_bias=True)

    def construct(self, x, cls_token):
        return self.conv(x), self.fc(cls_token)

mindcv.models.pit.pit_b(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get PiT-B model. Refer to the base class models.PoolingTransformer for more details.

Source code in mindcv\models\pit.py
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
@register_model
def pit_b(pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs) -> PoolingTransformer:
    """Get PiT-B model.
    Refer to the base class `models.PoolingTransformer` for more details."""
    cfg = default_cfgs["pit_b"]
    model = PoolingTransformer(
        image_size=224, patch_size=14, stride=7,
        base_dims=[64, 64, 64], depth=[3, 6, 4], heads=[4, 8, 16],
        mlp_ratio=4.0, num_classes=num_classes, in_chans=in_channels,
        **kwargs
    )
    if pretrained:
        load_pretrained(model, cfg, num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.pit.pit_s(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get PiT-S model. Refer to the base class models.PoolingTransformer for more details.

Source code in mindcv\models\pit.py
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
@register_model
def pit_s(pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs) -> PoolingTransformer:
    """Get PiT-S model.
    Refer to the base class `models.PoolingTransformer` for more details."""
    cfg = default_cfgs["pit_s"]
    model = PoolingTransformer(
        image_size=224, patch_size=16, stride=8,
        base_dims=[48, 48, 48], depth=[2, 6, 4], heads=[3, 6, 12],
        mlp_ratio=4.0, num_classes=num_classes, in_chans=in_channels,
        **kwargs
    )
    if pretrained:
        load_pretrained(model, cfg, num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.pit.pit_ti(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get PiT-Ti model. Refer to the base class models.PoolingTransformer for more details.

Source code in mindcv\models\pit.py
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
@register_model
def pit_ti(pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs) -> PoolingTransformer:
    """Get PiT-Ti model.
    Refer to the base class `models.PoolingTransformer` for more details."""
    cfg = default_cfgs["pit_ti"]
    model = PoolingTransformer(
        image_size=224, patch_size=16, stride=8,
        base_dims=[32, 32, 32], depth=[2, 6, 4], heads=[2, 4, 8],
        mlp_ratio=4.0, num_classes=num_classes, in_chans=in_channels,
        **kwargs
    )
    if pretrained:
        load_pretrained(model, cfg, num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.pit.pit_xs(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get PiT-XS model. Refer to the base class models.PoolingTransformer for more details.

Source code in mindcv\models\pit.py
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
@register_model
def pit_xs(pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs) -> PoolingTransformer:
    """Get PiT-XS model.
    Refer to the base class `models.PoolingTransformer` for more details."""
    cfg = default_cfgs["pit_xs"]
    model = PoolingTransformer(
        image_size=224, patch_size=16, stride=8,
        base_dims=[48, 48, 48], depth=[2, 6, 4], heads=[2, 4, 8],
        mlp_ratio=4.0, num_classes=num_classes, in_chans=in_channels,
        **kwargs
    )
    if pretrained:
        load_pretrained(model, cfg, num_classes=num_classes, in_channels=in_channels)
    return model

poolformer

mindcv.models.poolformer

MindSpore implementation of poolformer. Refer to PoolFormer: MetaFormer Is Actually What You Need for Vision.

mindcv.models.poolformer.ConvMlp

Bases: Cell

MLP using 1x1 convs that keeps spatial dims

Source code in mindcv\models\poolformer.py
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
class ConvMlp(nn.Cell):
    """MLP using 1x1 convs that keeps spatial dims.

    Structure: fc1 (1x1 conv) -> norm -> act -> drop -> fc2 (1x1 conv) -> drop.

    Args:
        in_features: number of input channels.
        hidden_features: hidden channels; defaults to ``in_features``.
        out_features: output channels; defaults to ``in_features``.
        act_layer: activation class. Default: nn.GELU.
        norm_layer: optional normalization applied after fc1. Default: None (identity).
        bias: bias flag(s) for the two convs; a scalar is broadcast to both.
        drop: dropout probability applied after the activation and after fc2.
    """

    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        act_layer=nn.GELU,
        norm_layer=None,
        bias=True,
        drop=0.0,
    ):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        bias = to_2tuple(bias)  # independent bias flags for fc1 / fc2

        self.fc1 = nn.Conv2d(in_features, hidden_features, kernel_size=1, has_bias=bias[0])
        self.norm = norm_layer(hidden_features) if norm_layer else Identity()
        self.act = act_layer(approximate=False)
        self.drop = Dropout(p=drop)
        self.fc2 = nn.Conv2d(hidden_features, out_features, kernel_size=1, has_bias=bias[1])
        self.cls_init_weights()

    def cls_init_weights(self):
        """Initialize weights for cells."""
        # Truncated-normal (sigma=0.02) weights and zero biases for all convs.
        for name, m in self.cells_and_names():
            if isinstance(m, nn.Conv2d):
                m.weight.set_data(
                    init.initializer(init.TruncatedNormal(sigma=.02), m.weight.shape, m.weight.dtype))
                if m.bias is not None:
                    m.bias.set_data(
                        init.initializer(init.Constant(0), m.bias.shape, m.bias.dtype))

    def construct(self, x):
        x = self.fc1(x)
        # Fix: apply the normalization layer. It was constructed from
        # `norm_layer` but never used, so the argument silently had no effect.
        # With the default norm_layer=None this is an Identity (no behavior change).
        x = self.norm(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x
mindcv.models.poolformer.ConvMlp.cls_init_weights()

Initialize weights for cells.

Source code in mindcv\models\poolformer.py
88
89
90
91
92
93
94
95
96
def cls_init_weights(self):
    """Initialize weights for cells."""
    # Truncated-normal init (sigma=0.02) for every Conv2d weight;
    # biases (when present) are zeroed.
    for name, m in self.cells_and_names():
        if isinstance(m, nn.Conv2d):
            m.weight.set_data(
                init.initializer(init.TruncatedNormal(sigma=.02), m.weight.shape, m.weight.dtype))
            if m.bias is not None:
                m.bias.set_data(
                    init.initializer(init.Constant(0), m.bias.shape, m.bias.dtype))

mindcv.models.poolformer.PatchEmbed

Bases: Cell

Patch Embedding that is implemented by a layer of conv. Input: tensor in shape [B, C, H, W] Output: tensor in shape [B, C, H/stride, W/stride]

Source code in mindcv\models\poolformer.py
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
class PatchEmbed(nn.Cell):
    """Patch Embedding that is implemented by a layer of conv.
    Input: tensor in shape [B, C, H, W]
    Output: tensor in shape [B, C, H/stride, W/stride]"""

    def __init__(self, in_chs=3, embed_dim=768, patch_size=16, stride=16, padding=0, norm_layer=None):
        super().__init__()
        # A strided convolution does the patch projection; explicit "pad"
        # mode keeps the output spatial size at H/stride x W/stride.
        self.proj = nn.Conv2d(
            in_chs,
            embed_dim,
            kernel_size=to_2tuple(patch_size),
            stride=to_2tuple(stride),
            padding=padding,
            pad_mode="pad",
            has_bias=True,
        )
        self.norm = norm_layer(embed_dim) if norm_layer else Identity()

    def construct(self, x):
        return self.norm(self.proj(x))

mindcv.models.poolformer.PoolFormer

Bases: Cell

PoolFormer model class, based on "MetaFormer Is Actually What You Need for Vision" <https://arxiv.org/pdf/2111.11418v3.pdf>_

PARAMETER DESCRIPTION
layers

number of blocks for the 4 stages

embed_dims

the embedding dims for the 4 stages. Default: (64, 128, 320, 512)

DEFAULT: (64, 128, 320, 512)

mlp_ratios

mlp ratios for the 4 stages. Default: (4, 4, 4, 4)

DEFAULT: (4, 4, 4, 4)

downsamples

flags to apply downsampling or not. Default: (True, True, True, True)

DEFAULT: (True, True, True, True)

pool_size

the pooling size for the 4 stages. Default: 3

DEFAULT: 3

in_chans

number of input channels. Default: 3

DEFAULT: 3

num_classes

number of classes for the image classification. Default: 1000

DEFAULT: 1000

global_pool

define the types of pooling layer. Default: avg

DEFAULT: 'avg'

norm_layer

define the types of normalization. Default: nn.GroupNorm

DEFAULT: GroupNorm

act_layer

define the types of activation. Default: nn.GELU

DEFAULT: GELU

in_patch_size

specify the patch embedding for the input image. Default: 7

DEFAULT: 7

in_stride

specify the stride for the input image. Default: 4.

DEFAULT: 4

in_pad

specify the pad for the input image. Default: 2.

DEFAULT: 2

down_patch_size

specify the downsample. Default: 3.

DEFAULT: 3

down_stride

specify the downsample (patch embed.). Default: 2.

DEFAULT: 2

down_pad

specify the downsample (patch embed.). Default: 1.

DEFAULT: 1

drop_rate

dropout rate of the layer before main classifier. Default: 0.

DEFAULT: 0.0

drop_path_rate

Stochastic Depth. Default: 0.

DEFAULT: 0.0

layer_scale_init_value

LayerScale. Default: 1e-5.

DEFAULT: 1e-05

fork_feat

whether output features of the 4 stages, for dense prediction. Default: False.

DEFAULT: False

Source code in mindcv\models\poolformer.py
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
class PoolFormer(nn.Cell):
    r"""PoolFormer model class, based on
    `"MetaFormer Is Actually What You Need for Vision" <https://arxiv.org/pdf/2111.11418v3.pdf>`_

    Args:
        layers: number of blocks for the 4 stages
        embed_dims: the embedding dims for the 4 stages. Default: (64, 128, 320, 512)
        mlp_ratios: mlp ratios for the 4 stages. Default: (4, 4, 4, 4)
        downsamples: flags to apply downsampling or not. Default: (True, True, True, True)
        pool_size: the pooling size for the 4 stages. Default: 3
        in_chans: number of input channels. Default: 3
        num_classes: number of classes for the image classification. Default: 1000
        global_pool: define the types of pooling layer. Default: avg
        norm_layer: define the types of normalization. Default: nn.GroupNorm
        act_layer: define the types of activation. Default: nn.GELU
        in_patch_size: specify the patch embedding for the input image. Default: 7
        in_stride: specify the stride for the input image. Default: 4.
        in_pad: specify the pad for the input image. Default: 2.
        down_patch_size: specify the downsample. Default: 3.
        down_stride: specify the downsample (patch embed.). Default: 2.
        down_pad: specify the downsample (patch embed.). Default: 1.
        drop_rate: dropout rate of the layer before main classifier. Default: 0.
        drop_path_rate: Stochastic Depth. Default: 0.
        layer_scale_init_value: LayerScale. Default: 1e-5.
        fork_feat: whether output features of the 4 stages, for dense prediction. Default: False.
    """

    def __init__(
        self,
        layers,
        embed_dims=(64, 128, 320, 512),
        mlp_ratios=(4, 4, 4, 4),
        downsamples=(True, True, True, True),
        pool_size=3,
        in_chans=3,
        num_classes=1000,
        global_pool="avg",
        norm_layer=nn.GroupNorm,
        act_layer=nn.GELU,
        in_patch_size=7,
        in_stride=4,
        in_pad=2,
        down_patch_size=3,
        down_stride=2,
        down_pad=1,
        drop_rate=0.0,
        drop_path_rate=0.0,
        layer_scale_init_value=1e-5,
        fork_feat=False,
    ):
        super().__init__()

        # num_classes is only meaningful for classification (not dense prediction).
        if not fork_feat:
            self.num_classes = num_classes
        self.fork_feat = fork_feat

        self.global_pool = global_pool
        self.num_features = embed_dims[-1]
        self.grad_checkpointing = False

        # Stem: overlapping conv patch embedding applied to the raw image.
        self.patch_embed = PatchEmbed(
            patch_size=in_patch_size, stride=in_stride, padding=in_pad,
            in_chs=in_chans, embed_dim=embed_dims[0])

        # set the main block in network
        network = []
        for i in range(len(layers)):
            network.append(basic_blocks(
                embed_dims[i], i, layers,
                pool_size=pool_size, mlp_ratio=mlp_ratios[i],
                act_layer=act_layer, norm_layer=norm_layer,
                drop_rate=drop_rate, drop_path_rate=drop_path_rate,
                layer_scale_init_value=layer_scale_init_value)
            )
            if i < len(layers) - 1 and (downsamples[i] or embed_dims[i] != embed_dims[i + 1]):
                # downsampling between stages (strided conv patch embedding)
                network.append(PatchEmbed(
                    in_chs=embed_dims[i], embed_dim=embed_dims[i + 1],
                    patch_size=down_patch_size, stride=down_stride, padding=down_pad)
                )

        self.network = nn.SequentialCell(*network)
        # Final norm; with the default nn.GroupNorm the first argument is num_groups=1.
        self.norm = norm_layer(1, embed_dims[-1])
        self.head = nn.Dense(embed_dims[-1], num_classes, has_bias=True) if num_classes > 0 else Identity()
        self.cls_init_weights()

    def cls_init_weights(self):
        """Initialize weights for cells."""
        # Only Dense layers are (re-)initialized here; ConvMlp initializes
        # its own convs in its constructor.
        for name, m in self.cells_and_names():
            if isinstance(m, nn.Dense):
                m.weight.set_data(
                    init.initializer(init.TruncatedNormal(sigma=.02), m.weight.shape, m.weight.dtype))
                if m.bias is not None:
                    m.bias.set_data(
                        init.initializer(init.Constant(0), m.bias.shape, m.bias.dtype))

    def reset_classifier(self, num_classes, global_pool=None):
        """Replace the classification head with one for `num_classes` classes."""
        self.num_classes = num_classes
        if global_pool is not None:
            self.global_pool = global_pool
        self.head = nn.Dense(self.num_features, num_classes) if num_classes > 0 else Identity()

    def forward_features(self, x: Tensor) -> Tensor:
        x = self.patch_embed(x)
        x = self.network(x)
        if self.fork_feat:
            # output features of four stages for dense prediction
            return x
        x = self.norm(x)
        return x

    def forward_head(self, x: Tensor) -> Tensor:
        # Global average pool over the spatial dims, then classify.
        return self.head(x.mean([-2, -1]))

    def construct(self, x: Tensor) -> Tensor:
        x = self.forward_features(x)
        return self.forward_head(x)
mindcv.models.poolformer.PoolFormer.cls_init_weights()

Initialize weights for cells.

Source code in mindcv\models\poolformer.py
291
292
293
294
295
296
297
298
299
def cls_init_weights(self):
    """Initialize weights for cells."""
    # Truncated-normal init (sigma=0.02) for every Dense weight;
    # biases (when present) are zeroed.
    for name, m in self.cells_and_names():
        if isinstance(m, nn.Dense):
            m.weight.set_data(
                init.initializer(init.TruncatedNormal(sigma=.02), m.weight.shape, m.weight.dtype))
            if m.bias is not None:
                m.bias.set_data(
                    init.initializer(init.Constant(0), m.bias.shape, m.bias.dtype))

mindcv.models.poolformer.PoolFormerBlock

Bases: Cell

Implementation of one PoolFormer block.

Source code in mindcv\models\poolformer.py
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
class PoolFormerBlock(nn.Cell):
    """One PoolFormer block: pooling token mixer + ConvMlp, each on a
    residual branch with optional LayerScale and stochastic depth."""

    def __init__(
        self,
        dim,
        pool_size=3,
        mlp_ratio=4.0,
        act_layer=nn.GELU,
        norm_layer=nn.GroupNorm,
        drop=0.0,
        drop_path=0.0,
        layer_scale_init_value=1e-5,
    ):
        super().__init__()
        self.norm1 = norm_layer(1, dim)
        self.token_mixer = Pooling(pool_size=pool_size)
        self.drop_path = Identity() if drop_path <= 0.0 else DropPath(drop_path)
        self.norm2 = norm_layer(1, dim)
        self.mlp = ConvMlp(dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)

        if layer_scale_init_value:
            # Per-channel LayerScale, one parameter vector per residual branch.
            scale_init = Tensor(layer_scale_init_value * np.ones([dim]).astype(np.float32))
            self.layer_scale_1 = mindspore.Parameter(scale_init)
            self.layer_scale_2 = mindspore.Parameter(scale_init)
        else:
            self.layer_scale_1 = None
            self.layer_scale_2 = None
        self.expand_dims = ops.ExpandDims()

    def construct(self, x):
        if self.layer_scale_1 is not None:
            # Broadcast the (C,) scale vectors to (C, 1, 1) before scaling.
            scale1 = self.expand_dims(self.expand_dims(self.layer_scale_1, -1), -1)
            x = x + self.drop_path(scale1 * self.token_mixer(self.norm1(x)))
            scale2 = self.expand_dims(self.expand_dims(self.layer_scale_2, -1), -1)
            x = x + self.drop_path(scale2 * self.mlp(self.norm2(x)))
        else:
            x = x + self.drop_path(self.token_mixer(self.norm1(x)))
            x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x

mindcv.models.poolformer.basic_blocks(dim, index, layers, pool_size=3, mlp_ratio=4.0, act_layer=nn.GELU, norm_layer=nn.GroupNorm, drop_rate=0.0, drop_path_rate=0.0, layer_scale_init_value=1e-05)

generate PoolFormer blocks for a stage

Source code in mindcv\models\poolformer.py
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
def basic_blocks(
    dim,
    index,
    layers,
    pool_size=3,
    mlp_ratio=4.0,
    act_layer=nn.GELU,
    norm_layer=nn.GroupNorm,
    drop_rate=0.0,
    drop_path_rate=0.0,
    layer_scale_init_value=1e-5,
):
    """generate PoolFormer blocks for a stage"""
    # Drop-path rates grow linearly with the block's global position
    # across all stages (standard stochastic-depth decay rule).
    total = sum(layers) - 1
    offset = sum(layers[:index])
    stage = [
        PoolFormerBlock(
            dim, pool_size=pool_size, mlp_ratio=mlp_ratio,
            act_layer=act_layer, norm_layer=norm_layer,
            drop=drop_rate, drop_path=drop_path_rate * (offset + i) / total,
            layer_scale_init_value=layer_scale_init_value,
        )
        for i in range(layers[index])
    ]
    return nn.SequentialCell(*stage)

mindcv.models.poolformer.poolformer_m36(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get poolformer_m36 model. Refer to the base class models.PoolFormer for more details.

Source code in mindcv\models\poolformer.py
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
@register_model
def poolformer_m36(pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs) -> PoolFormer:
    """Get poolformer_m36 model.
    Refer to the base class `models.PoolFormer` for more details."""
    # M-series: wider embeddings than S-series, smaller LayerScale init.
    net = PoolFormer(
        in_chans=in_channels,
        num_classes=num_classes,
        layers=(6, 6, 18, 6),
        layer_scale_init_value=1e-6,
        embed_dims=(96, 192, 384, 768),
        **kwargs
    )
    if pretrained:
        load_pretrained(net, default_cfgs["poolformer_m36"], num_classes=num_classes, in_channels=in_channels)
    return net

mindcv.models.poolformer.poolformer_m48(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get poolformer_m48 model. Refer to the base class models.PoolFormer for more details.

Source code in mindcv\models\poolformer.py
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
@register_model
def poolformer_m48(pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs) -> PoolFormer:
    """Get poolformer_m48 model.
    Refer to the base class `models.PoolFormer` for more details."""
    # Deepest M-series variant: 48 blocks total, smaller LayerScale init.
    net = PoolFormer(
        in_chans=in_channels,
        num_classes=num_classes,
        layers=(8, 8, 24, 8),
        layer_scale_init_value=1e-6,
        embed_dims=(96, 192, 384, 768),
        **kwargs
    )
    if pretrained:
        load_pretrained(net, default_cfgs["poolformer_m48"], num_classes=num_classes, in_channels=in_channels)
    return net

mindcv.models.poolformer.poolformer_s12(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get poolformer_s12 model. Refer to the base class models.PoolFormer for more details.

Source code in mindcv\models\poolformer.py
324
325
326
327
328
329
330
331
332
@register_model
def poolformer_s12(pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs) -> PoolFormer:
    """Get poolformer_s12 model.
    Refer to the base class `models.PoolFormer` for more details."""
    # Smallest S-series variant; all other hyper-parameters keep their defaults.
    net = PoolFormer(in_chans=in_channels, num_classes=num_classes, layers=(2, 2, 6, 2), **kwargs)
    if pretrained:
        load_pretrained(net, default_cfgs["poolformer_s12"], num_classes=num_classes, in_channels=in_channels)
    return net

mindcv.models.poolformer.poolformer_s24(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get poolformer_s24 model. Refer to the base class models.PoolFormer for more details.

Source code in mindcv\models\poolformer.py
335
336
337
338
339
340
341
342
343
@register_model
def poolformer_s24(pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs) -> PoolFormer:
    """Get poolformer_s24 model.
    Refer to the base class `models.PoolFormer` for more details."""
    # S24: twice the depth of S12; default embed dims and LayerScale init.
    net = PoolFormer(in_chans=in_channels, num_classes=num_classes, layers=(4, 4, 12, 4), **kwargs)
    if pretrained:
        load_pretrained(net, default_cfgs["poolformer_s24"], num_classes=num_classes, in_channels=in_channels)
    return net

mindcv.models.poolformer.poolformer_s36(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get poolformer_s36 model. Refer to the base class models.PoolFormer for more details.

Source code in mindcv\models\poolformer.py
346
347
348
349
350
351
352
353
354
355
356
@register_model
def poolformer_s36(pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs) -> PoolFormer:
    """Get poolformer_s36 model.
    Refer to the base class `models.PoolFormer` for more details."""
    # S36: deepest S-series variant; uses the smaller 1e-6 LayerScale init.
    net = PoolFormer(
        in_chans=in_channels,
        num_classes=num_classes,
        layers=(6, 6, 18, 6),
        layer_scale_init_value=1e-6,
        **kwargs
    )
    if pretrained:
        load_pretrained(net, default_cfgs["poolformer_s36"], num_classes=num_classes, in_channels=in_channels)
    return net

pvt

mindcv.models.pvt

MindSpore implementation of PVT. Refer to PVT: Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions

mindcv.models.pvt.Attention

Bases: Cell

spatial-reduction attention (SRA)

Source code in mindcv\models\pvt.py
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
class Attention(nn.Cell):
    """spatial-reduction attention (SRA)

    Multi-head attention where, for sr_ratio > 1, keys and values are computed
    from a spatially down-sampled copy of the input, reducing attention cost.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        qk_scale: Optional[float] = None,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
        sr_ratio: int = 1,
    ):
        super(Attention, self).__init__()
        assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}."

        self.dim = dim
        self.num_heads = num_heads
        head_dim = dim // num_heads
        # Caller-provided qk_scale, otherwise the standard 1/sqrt(head_dim).
        self.scale = qk_scale or head_dim**-0.5

        # Queries come from the full token sequence; keys/values may come
        # from the spatially reduced sequence (see `sr` below).
        self.q = nn.Dense(dim, dim, has_bias=qkv_bias)
        self.kv = nn.Dense(dim, dim * 2, has_bias=qkv_bias)
        self.attn_drop = Dropout(p=attn_drop)
        self.proj = nn.Dense(dim, dim)
        self.proj_drop = Dropout(p=proj_drop)
        self.qk_batmatmul = ops.BatchMatMul(transpose_b=True)  # computes q @ k^T
        self.batmatmul = ops.BatchMatMul()
        self.softmax = nn.Softmax(axis=-1)
        self.reshape = ops.reshape
        self.transpose = ops.transpose

        self.sr_ratio = sr_ratio
        if sr_ratio > 1:
            # Spatial reduction: strided conv shrinks the key/value map by
            # sr_ratio in each spatial dim. Not created when sr_ratio == 1.
            self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio, has_bias=True)
            self.norm = nn.LayerNorm([dim])

    def construct(self, x, H, W):
        # x: (B, N, C) token sequence where N == H * W.
        B, N, C = x.shape
        q = self.q(x)
        # (B, N, C) -> (B, num_heads, N, head_dim)
        q = self.reshape(q, (B, N, self.num_heads, C // self.num_heads))
        q = self.transpose(q, (0, 2, 1, 3))
        if self.sr_ratio > 1:
            # Restore the spatial layout, reduce it, then flatten back to tokens.
            x_ = self.reshape(self.transpose(x, (0, 2, 1)), (B, C, H, W))

            x_ = self.transpose(self.reshape(self.sr(x_), (B, C, -1)), (0, 2, 1))
            x_ = self.norm(x_)
            kv = self.kv(x_)

            # (2, B, num_heads, N', head_dim): index 0 is k, index 1 is v.
            kv = self.transpose(self.reshape(kv, (B, -1, 2, self.num_heads, C // self.num_heads)), (2, 0, 3, 1, 4))
        else:
            kv = self.kv(x)
            kv = self.transpose(self.reshape(kv, (B, -1, 2, self.num_heads, C // self.num_heads)), (2, 0, 3, 1, 4))
        k, v = kv[0], kv[1]
        # Scaled dot-product attention over the (possibly reduced) keys.
        attn = self.qk_batmatmul(q, k) * self.scale
        attn = self.softmax(attn)
        attn = self.attn_drop(attn)
        x = self.batmatmul(attn, v)
        # Merge heads back: (B, num_heads, N, head_dim) -> (B, N, C).
        x = self.reshape(self.transpose(x, (0, 2, 1, 3)), (B, N, C))
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

mindcv.models.pvt.Block

Bases: Cell

Block with spatial-reduction attention (SRA) and feed forward

Source code in mindcv\models\pvt.py
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
class Block(nn.Cell):
    """ Block with spatial-reduction attention (SRA) and feed forward"""
    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1):
        super(Block, self).__init__()
        # NOTE(review): norm1 pins epsilon=1e-5 while norm2 uses the layer's
        # (or partial's) default — looks inconsistent; confirm against checkpoints.
        self.norm1 = norm_layer([dim], epsilon=1e-5)
        self.attn = Attention(
            dim,
            num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
            attn_drop=attn_drop, proj_drop=drop, sr_ratio=sr_ratio)
        # Stochastic depth applied to both residual branches.
        self.drop_path = Identity() if drop_path <= 0.0 else DropPath(drop_path)
        self.norm2 = norm_layer([dim])
        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)

    def construct(self, x, H, W):
        # Pre-norm residual attention, then pre-norm residual MLP.
        attn_out = self.attn(self.norm1(x), H, W)
        x = x + self.drop_path(attn_out)
        mlp_out = self.mlp(self.norm2(x))
        x = x + self.drop_path(mlp_out)
        return x

mindcv.models.pvt.PatchEmbed

Bases: Cell

Image to Patch Embedding

Source code in mindcv\models\pvt.py
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
class PatchEmbed(nn.Cell):
    """Image to Patch Embedding"""

    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
        super().__init__()

        self.img_size = (img_size, img_size)
        self.patch_size = (patch_size, patch_size)

        # Patch grid at the nominal input resolution.
        self.H = self.img_size[0] // self.patch_size[0]
        self.W = self.img_size[1] // self.patch_size[1]
        self.num_patches = self.H * self.W
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=self.patch_size, stride=self.patch_size, has_bias=True)
        self.norm = nn.LayerNorm([embed_dim], epsilon=1e-5)
        self.reshape = ops.reshape
        self.transpose = ops.transpose

    def construct(self, x):
        _, _, in_h, in_w = x.shape

        feat = self.proj(x)
        b, c, h, w = feat.shape
        # Flatten spatial dims to tokens: (B, C, H', W') -> (B, H'*W', C).
        tokens = self.transpose(self.reshape(feat, (b, c, h * w)), (0, 2, 1))
        tokens = self.norm(tokens)
        # Grid size for the actual input (may differ from the nominal one).
        out_h, out_w = in_h // self.patch_size[0], in_w // self.patch_size[1]

        return tokens, (out_h, out_w)

mindcv.models.pvt.PyramidVisionTransformer

Bases: Cell

Pyramid Vision Transformer model class, based on "Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions" <https://arxiv.org/abs/2102.12122>_ # noqa: E501

PARAMETER DESCRIPTION
img_size(int)

size of a input image.

patch_size

size of a single image patch.

TYPE: int) DEFAULT: 4

in_chans

number of channels of the input. Default: 3.

TYPE: int) DEFAULT: 3

num_classes

number of classification classes. Default: 1000.

TYPE: int) DEFAULT: 1000

embed_dims

how many hidden dim in each PatchEmbed.

TYPE: list) DEFAULT: [64, 128, 320, 512]

num_heads

number of attention head in each stage.

TYPE: list) DEFAULT: [1, 2, 5, 8]

mlp_ratios

ratios of MLP hidden dims in each stage.

TYPE: list DEFAULT: [8, 8, 4, 4]

qkv_bias(bool)

use bias in attention.

qk_scale(float)

Scale multiplied by qk in attention(if not none), otherwise head_dim ** -0.5.

drop_rate(float)

The drop rate for each block. Default: 0.0.

attn_drop_rate(float)

The drop rate for attention. Default: 0.0.

drop_path_rate(float)

The drop rate for drop path. Default: 0.0.

norm_layer(nn.Cell)

Norm layer that will be used in blocks. Default: nn.LayerNorm.

depths

number of Blocks.

TYPE: list) DEFAULT: [2, 2, 2, 2]

sr_ratios(list)

stride and kernel size of each attention.

num_stages(int)

number of stage. Default: 4.

Source code in mindcv\models\pvt.py
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
class PyramidVisionTransformer(nn.Cell):
    r"""Pyramid Vision Transformer model class, based on
    `"Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions" <https://arxiv.org/abs/2102.12122>`_  # noqa: E501

    Args:
        img_size(int) : size of a input image.
        patch_size (int) : size of a single image patch.
        in_chans (int) : number of input channels. Default: 3.
        num_classes (int) : number of classification classes. Default: 1000.
        embed_dims (list) : how many hidden dim in each PatchEmbed.
        num_heads (list) : number of attention head in each stage.
        mlp_ratios (list): ratios of MLP hidden dims in each stage.
        qkv_bias(bool) : use bias in attention.
        qk_scale(float) : Scale multiplied by qk in attention(if not none), otherwise head_dim ** -0.5.
        drop_rate(float) : The drop rate for each block. Default: 0.0.
        attn_drop_rate(float) : The drop rate for attention. Default: 0.0.
        drop_path_rate(float) : The drop rate for drop path. Default: 0.0.
        norm_layer(nn.Cell) : Norm layer that will be used in blocks. Default: nn.LayerNorm.
        depths (list) : number of Blocks.
        sr_ratios(list) : stride and kernel size of each attention.
        num_stages(int) : number of stage. Default: 4.
    """

    def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=1000, embed_dims=[64, 128, 320, 512],
                 num_heads=[1, 2, 5, 8], mlp_ratios=[8, 8, 4, 4], qkv_bias=True, qk_scale=None, drop_rate=0.0,
                 attn_drop_rate=0.0, drop_path_rate=0.0, norm_layer=nn.LayerNorm,
                 depths=[2, 2, 2, 2], sr_ratios=[8, 4, 2, 1], num_stages=4):
        super(PyramidVisionTransformer, self).__init__()
        self.num_classes = num_classes
        self.depths = depths
        self.num_stages = num_stages
        self.embed_dims = embed_dims  # kept so reset_classifier knows the head width
        dpr = [x.item() for x in np.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
        cur = 0
        b_list = []
        self.pos_embed = []  # NOTE: unused; kept for backward compatibility
        self.pos_drop = Dropout(p=drop_rate)
        for i in range(num_stages):
            block = nn.CellList(
                [Block(dim=embed_dims[i], num_heads=num_heads[i], mlp_ratio=mlp_ratios[i], qkv_bias=qkv_bias,
                       qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + j],
                       norm_layer=norm_layer, sr_ratio=sr_ratios[i])
                 for j in range(depths[i])
                 ])

            b_list.append(block)
            # Fix: advance by THIS stage's depth. The previous `cur += depths[0]`
            # skewed the per-block drop-path rates whenever stage depths differ
            # (e.g. pvt_medium/pvt_large).
            cur += depths[i]

        # Stage 1 embeds raw pixels; stages 2-4 embed the previous stage's
        # feature map with a stride-2 patch embedding (halving resolution).
        self.patch_embed1 = PatchEmbed(img_size=img_size,
                                       patch_size=patch_size,
                                       in_chans=in_chans,
                                       embed_dim=embed_dims[0])
        num_patches = self.patch_embed1.num_patches
        self.pos_embed1 = mindspore.Parameter(ops.zeros((1, num_patches, embed_dims[0]), mindspore.float16))
        self.pos_drop1 = Dropout(p=drop_rate)

        self.patch_embed2 = PatchEmbed(img_size=img_size // (2 ** (1 + 1)),
                                       patch_size=2,
                                       in_chans=embed_dims[1 - 1],
                                       embed_dim=embed_dims[1])
        num_patches = self.patch_embed2.num_patches
        self.pos_embed2 = mindspore.Parameter(ops.zeros((1, num_patches, embed_dims[1]), mindspore.float16))
        self.pos_drop2 = Dropout(p=drop_rate)

        self.patch_embed3 = PatchEmbed(img_size=img_size // (2 ** (2 + 1)),
                                       patch_size=2,
                                       in_chans=embed_dims[2 - 1],
                                       embed_dim=embed_dims[2])
        num_patches = self.patch_embed3.num_patches
        self.pos_embed3 = mindspore.Parameter(ops.zeros((1, num_patches, embed_dims[2]), mindspore.float16))
        self.pos_drop3 = Dropout(p=drop_rate)

        self.patch_embed4 = PatchEmbed(img_size // (2 ** (3 + 1)),
                                       patch_size=2,
                                       in_chans=embed_dims[3 - 1],
                                       embed_dim=embed_dims[3])
        num_patches = self.patch_embed4.num_patches + 1  # +1 for the cls token slot
        self.pos_embed4 = mindspore.Parameter(ops.zeros((1, num_patches, embed_dims[3]), mindspore.float16))
        self.pos_drop4 = Dropout(p=drop_rate)
        self.Blocks = nn.CellList(b_list)

        self.norm = norm_layer([embed_dims[3]])

        # cls_token
        self.cls_token = mindspore.Parameter(ops.zeros((1, 1, embed_dims[3]), mindspore.float32))

        # classification head
        self.head = nn.Dense(embed_dims[3], num_classes) if num_classes > 0 else Identity()
        self.reshape = ops.reshape
        self.transpose = ops.transpose
        self.tile = ops.Tile()
        self.Concat = ops.Concat(axis=1)
        self._initialize_weights()

    def _initialize_weights(self):
        """Truncated-normal Dense weights, fan-out Normal conv weights,
        unit/zero LayerNorm affine parameters, zero biases."""
        for _, cell in self.cells_and_names():
            if isinstance(cell, nn.Dense):
                cell.weight.set_data(weight_init.initializer(weight_init.TruncatedNormal(sigma=0.02),
                                                             cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(weight_init.initializer(weight_init.Zero(), cell.bias.shape, cell.bias.dtype))
            elif isinstance(cell, nn.LayerNorm):
                cell.gamma.set_data(weight_init.initializer(weight_init.One(), cell.gamma.shape, cell.gamma.dtype))
                cell.beta.set_data(weight_init.initializer(weight_init.Zero(), cell.beta.shape, cell.beta.dtype))
            elif isinstance(cell, nn.Conv2d):
                fan_out = cell.kernel_size[0] * cell.kernel_size[1] * cell.out_channels
                fan_out //= cell.group
                cell.weight.set_data(weight_init.initializer(weight_init.Normal(sigma=math.sqrt(2.0 / fan_out)),
                                                             cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(weight_init.initializer(weight_init.Zero(), cell.bias.shape, cell.bias.dtype))

    def get_classifier(self):
        """Return the classification head."""
        return self.head

    def reset_classifier(self, num_classes, global_pool=""):
        """Replace the classification head with one for `num_classes` classes."""
        self.num_classes = num_classes
        # Fix: the head width is the last stage's embedding dim;
        # `self.embed_dim` was never defined and raised AttributeError here.
        self.head = nn.Dense(self.embed_dims[3], num_classes) if num_classes > 0 else Identity()

    def _get_pos_embed(self, pos_embed, ph, pw, H, W):
        """Bilinearly resize a positional embedding to an (H, W) token grid."""
        if H * W == self.patch_embed1.num_patches:
            return pos_embed
        else:
            pos_embed = self.transpose(self.reshape(pos_embed, (1, ph, pw, -1)), (0, 3, 1, 2))
            resize_bilinear = ResizeBilinear((H, W))
            pos_embed = resize_bilinear(pos_embed)

            pos_embed = self.transpose(self.reshape(pos_embed, (1, -1, H * W)), (0, 2, 1))

            return pos_embed

    def forward_features(self, x):
        B = x.shape[0]

        # Stage 1: tokens from raw pixels.
        x, (H, W) = self.patch_embed1(x)
        pos_embed = self.pos_embed1
        x = self.pos_drop1(x + pos_embed)
        for blk in self.Blocks[0]:
            x = blk(x, H, W)
        # Back to (B, C, H, W) so the next patch embedding can downsample.
        x = self.transpose(self.reshape(x, (B, H, W, -1)), (0, 3, 1, 2))

        x, (H, W) = self.patch_embed2(x)
        ph, pw = self.patch_embed2.H, self.patch_embed2.W
        pos_embed = self._get_pos_embed(self.pos_embed2, ph, pw, H, W)
        x = self.pos_drop2(x + pos_embed)
        for blk in self.Blocks[1]:
            x = blk(x, H, W)
        x = self.transpose(self.reshape(x, (B, H, W, -1)), (0, 3, 1, 2))

        x, (H, W) = self.patch_embed3(x)
        ph, pw = self.patch_embed3.H, self.patch_embed3.W
        pos_embed = self._get_pos_embed(self.pos_embed3, ph, pw, H, W)
        x = self.pos_drop3(x + pos_embed)
        for blk in self.Blocks[2]:
            x = blk(x, H, W)
        x = self.transpose(self.reshape(x, (B, H, W, -1)), (0, 3, 1, 2))

        # Stage 4 prepends a cls token; its positional embedding slot
        # (index 0) is excluded from the bilinear resize.
        x, (H, W) = self.patch_embed4(x)
        cls_tokens = self.tile(self.cls_token, (B, 1, 1))

        x = self.Concat((cls_tokens, x))
        ph, pw = self.patch_embed4.H, self.patch_embed4.W
        pos_embed_ = self._get_pos_embed(self.pos_embed4[:, 1:], ph, pw, H, W)
        pos_embed = self.Concat((self.pos_embed4[:, 0:1], pos_embed_))
        x = self.pos_drop4(x + pos_embed)
        for blk in self.Blocks[3]:
            x = blk(x, H, W)

        x = self.norm(x)

        # Classification uses the cls token only.
        return x[:, 0]

    def forward_head(self, x: Tensor) -> Tensor:
        return self.head(x)

    def construct(self, x):
        x = self.forward_features(x)
        x = self.forward_head(x)

        return x

mindcv.models.pvt.pvt_large(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get PVT large model Refer to the base class "models.PVT" for more details.

Source code in mindcv\models\pvt.py
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
@register_model
def pvt_large(
    pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs
) -> PyramidVisionTransformer:
    """Build the PVT-Large model.

    See the base class ``models.PVT`` for the full argument description.
    """
    default_cfg = default_cfgs['pvt_large']
    model = PyramidVisionTransformer(
        in_chans=in_channels,
        num_classes=num_classes,
        patch_size=4,
        embed_dims=[64, 128, 320, 512],
        num_heads=[1, 2, 5, 8],
        mlp_ratios=[8, 8, 4, 4],
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, epsilon=1e-6),
        depths=[3, 8, 27, 3],
        sr_ratios=[8, 4, 2, 1],
        **kwargs,
    )
    if pretrained:
        load_pretrained(model, default_cfg, num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.pvt.pvt_medium(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get PVT medium model Refer to the base class "models.PVT" for more details.

Source code in mindcv\models\pvt.py
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
@register_model
def pvt_medium(
    pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs
) -> PyramidVisionTransformer:
    """Build the PVT-Medium model.

    See the base class ``models.PVT`` for the full argument description.
    """
    default_cfg = default_cfgs['pvt_medium']
    model = PyramidVisionTransformer(
        in_chans=in_channels,
        num_classes=num_classes,
        patch_size=4,
        embed_dims=[64, 128, 320, 512],
        num_heads=[1, 2, 5, 8],
        mlp_ratios=[8, 8, 4, 4],
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, epsilon=1e-6),
        depths=[3, 4, 18, 3],
        sr_ratios=[8, 4, 2, 1],
        **kwargs,
    )
    if pretrained:
        load_pretrained(model, default_cfg, num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.pvt.pvt_small(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get PVT small model Refer to the base class "models.PVT" for more details.

Source code in mindcv\models\pvt.py
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
@register_model
def pvt_small(
    pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs
) -> PyramidVisionTransformer:
    """Build the PVT-Small model.

    See the base class ``models.PVT`` for the full argument description.
    """
    default_cfg = default_cfgs['pvt_small']
    model = PyramidVisionTransformer(
        in_chans=in_channels,
        num_classes=num_classes,
        patch_size=4,
        embed_dims=[64, 128, 320, 512],
        num_heads=[1, 2, 5, 8],
        mlp_ratios=[8, 8, 4, 4],
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, epsilon=1e-6),
        depths=[3, 4, 6, 3],
        sr_ratios=[8, 4, 2, 1],
        **kwargs,
    )
    if pretrained:
        load_pretrained(model, default_cfg, num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.pvt.pvt_tiny(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get PVT tiny model Refer to the base class "models.PVT" for more details.

Source code in mindcv\models\pvt.py
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
@register_model
def pvt_tiny(
    pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs
) -> PyramidVisionTransformer:
    """Build the PVT-Tiny model.

    See the base class ``models.PVT`` for the full argument description.
    """
    default_cfg = default_cfgs['pvt_tiny']
    model = PyramidVisionTransformer(
        in_chans=in_channels,
        num_classes=num_classes,
        patch_size=4,
        embed_dims=[64, 128, 320, 512],
        num_heads=[1, 2, 5, 8],
        mlp_ratios=[8, 8, 4, 4],
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, epsilon=1e-6),
        depths=[2, 2, 2, 2],
        sr_ratios=[8, 4, 2, 1],
        **kwargs,
    )
    if pretrained:
        load_pretrained(model, default_cfg, num_classes=num_classes, in_channels=in_channels)
    return model

pvtv2

mindcv.models.pvtv2

MindSpore implementation of PVTv2. Refer to the paper "PVTv2: Improved Baselines with Pyramid Vision Transformer".

mindcv.models.pvtv2.Attention

Bases: Cell

Linear Spatial Reduction Attention

Source code in mindcv\models\pvtv2.py
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
class Attention(nn.Cell):
    """Linear Spatial Reduction Attention.

    When ``linear`` is False and ``sr_ratio > 1``, keys/values are computed
    from a feature map downsampled by a strided convolution (SRA). When
    ``linear`` is True, keys/values come from a fixed 7x7 adaptive-average-
    pooled map followed by a 1x1 conv, LayerNorm and GELU (linear SRA).
    """

    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., sr_ratio=1,
                 linear=False):
        super().__init__()
        assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}."

        self.dim = dim
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim**-0.5

        self.q = nn.Dense(dim, dim, has_bias=qkv_bias)
        self.kv = nn.Dense(dim, dim * 2, has_bias=qkv_bias)
        self.attn_drop = Dropout(p=attn_drop)
        self.proj = nn.Dense(dim, dim)
        self.proj_drop = Dropout(p=proj_drop)
        self.qk_batmatmul = ops.BatchMatMul(transpose_b=True)
        self.batmatmul = ops.BatchMatMul()
        self.softmax = nn.Softmax(axis=-1)

        self.linear = linear
        self.sr_ratio = sr_ratio
        if not linear:
            if sr_ratio > 1:
                # Strided conv shrinks the key/value token grid by sr_ratio.
                self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio, has_bias=True)
                self.norm = nn.LayerNorm([dim])

        else:
            # Linear SRA: fixed 7x7 pooling regardless of input resolution.
            self.pool = nn.AdaptiveAvgPool2d(7)
            self.sr = nn.Conv2d(dim, dim, kernel_size=1, stride=1, has_bias=True)
            self.norm = nn.LayerNorm([dim])
            self.act = nn.GELU()

    def construct(self, x, H, W):
        B, N, C = x.shape
        # Queries: (B, N, C) -> (B, num_heads, N, head_dim).
        q = self.q(x)
        q = ops.reshape(q, (B, N, self.num_heads, C // self.num_heads))
        q = ops.transpose(q, (0, 2, 1, 3))

        if not self.linear:
            if self.sr_ratio > 1:
                # Tokens back to a (B, C, H, W) map for the reduction conv.
                x_ = ops.reshape(ops.transpose(x, (0, 2, 1)), (B, C, H, W))

                x_ = self.sr(x_)
                x_ = ops.transpose(ops.reshape(x_, (B, C, -1)), (0, 2, 1))
                x_ = self.norm(x_)

                kv = self.kv(x_)
                kv = ops.transpose(ops.reshape(kv, (B, -1, 2, self.num_heads, C // self.num_heads)), (2, 0, 3, 1, 4))
            else:
                kv = self.kv(x)
                kv = ops.transpose(ops.reshape(kv, (B, -1, 2, self.num_heads, C // self.num_heads)), (2, 0, 3, 1, 4))

        else:
            x_ = ops.reshape(ops.transpose(x, (0, 2, 1)), (B, C, H, W))
            x_ = self.sr(self.pool(x_))
            # BUG FIX: flatten spatial dims BEFORE moving channels last:
            # (B, C, 7, 7) -> (B, C, 49) -> (B, 49, C). The original applied a
            # 3-axis transpose to this 4D tensor and left the result in a
            # channel-first (B, C, N) layout, which is invalid input for
            # LayerNorm([dim]). This mirrors the non-linear branch above.
            x_ = ops.transpose(ops.reshape(x_, (B, C, -1)), (0, 2, 1))
            x_ = self.norm(x_)
            x_ = self.act(x_)
            kv = ops.transpose(ops.reshape(self.kv(x_), (B, -1, 2, self.num_heads, C // self.num_heads)),
                               (2, 0, 3, 1, 4))
        k, v = kv[0], kv[1]

        # Scaled dot-product attention over the (possibly reduced) kv tokens.
        attn = self.qk_batmatmul(q, k) * self.scale
        attn = self.softmax(attn)
        attn = self.attn_drop(attn)

        x = self.batmatmul(attn, v)
        x = ops.reshape(ops.transpose(x, (0, 2, 1, 3)), (B, N, C))
        x = self.proj(x)
        x = self.proj_drop(x)

        return x

mindcv.models.pvtv2.Block

Bases: Cell

Block with Linear Spatial Reduction Attention and Convolutional Feed-Forward

Source code in mindcv\models\pvtv2.py
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
class Block(nn.Cell):
    """Transformer block: linear-SRA attention plus a convolutional MLP,
    each applied with pre-LayerNorm, a residual connection, and drop-path."""

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1, linear=False, block_id=0):
        super().__init__()
        self.norm1 = norm_layer([dim])
        self.attn = Attention(
            dim,
            num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
            attn_drop=attn_drop, proj_drop=drop, sr_ratio=sr_ratio, linear=linear)

        # Stochastic depth; identity when drop_path == 0.
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity()

        self.norm2 = norm_layer([dim])

        hidden = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=hidden, act_layer=act_layer, drop=drop, linear=linear)

    def construct(self, x, H, W):
        # Pre-norm residual attention, then pre-norm residual MLP.
        x = x + self.drop_path(self.attn(self.norm1(x), H, W))
        return x + self.drop_path(self.mlp(self.norm2(x), H, W))

mindcv.models.pvtv2.DWConv

Bases: Cell

Depthwise convolution (grouped 3x3, one filter per channel)

Source code in mindcv\models\pvtv2.py
51
52
53
54
55
56
57
58
59
60
61
62
63
64
class DWConv(nn.Cell):
    """3x3 depthwise convolution (group=dim, one filter per channel) applied to a token sequence."""

    def __init__(self, dim=768):
        super(DWConv, self).__init__()
        # group=dim makes this a depthwise convolution: each channel is
        # filtered independently; there is no pointwise (1x1) mixing step.
        self.dwconv = nn.Conv2d(dim, dim, 3, 1, has_bias=True, group=dim)

    def construct(self, x, H, W):
        B, N, C = x.shape
        # Tokens -> feature map: (B, N, C) -> (B, C, H, W); assumes N == H * W.
        x = ops.transpose(x, (0, 2, 1)).view((B, C, H, W))
        x = self.dwconv(x)
        # Feature map -> tokens: (B, C, H, W) -> (B, H*W, C).
        x = ops.transpose(x.view((B, C, H * W)), (0, 2, 1))

        return x

mindcv.models.pvtv2.Mlp

Bases: Cell

MLP with depthwise separable convolution

Source code in mindcv\models\pvtv2.py
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
class Mlp(nn.Cell):
    """Feed-forward network with a depthwise convolution between the two dense layers.

    When ``linear`` is True, a ReLU is inserted right after the first dense
    layer (linear-SRA variant of PVTv2).
    """

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0, linear=False):
        super().__init__()
        hidden_features = hidden_features or in_features
        out_features = out_features or in_features
        self.fc1 = nn.Dense(in_features, hidden_features)
        self.dwconv = DWConv(hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Dense(hidden_features, out_features)
        self.drop = Dropout(p=drop)
        self.linear = linear
        if self.linear:
            self.relu = nn.ReLU()

    def construct(self, x, H, W):
        h = self.fc1(x)
        if self.linear:
            h = self.relu(h)
        h = self.act(self.dwconv(h, H, W))
        h = self.drop(h)
        h = self.fc2(h)
        return self.drop(h)

mindcv.models.pvtv2.OverlapPatchEmbed

Bases: Cell

Overlapping Patch Embedding

Source code in mindcv\models\pvtv2.py
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
class OverlapPatchEmbed(nn.Cell):
    """Overlapping patch embedding: a strided convolution (kernel > stride, so
    neighbouring patches overlap) followed by LayerNorm over the tokens."""

    def __init__(self, img_size=224, patch_size=7, stride=4, in_chans=3, embed_dim=768):
        super().__init__()

        img_size = (img_size, img_size)
        patch_size = (patch_size, patch_size)

        assert max(patch_size) > stride, "Set larger patch_size than stride"

        self.img_size = img_size
        self.patch_size = patch_size
        self.H = img_size[0] // stride
        self.W = img_size[1] // stride
        self.num_patches = self.H * self.W
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=stride, has_bias=True)
        self.norm = nn.LayerNorm([embed_dim])

    def construct(self, x):
        feat = self.proj(x)
        B, C, H, W = feat.shape
        # (B, C, H, W) -> (B, H*W, C): one token per spatial location.
        tokens = ops.transpose(ops.reshape(feat, (B, C, H * W)), (0, 2, 1))
        return self.norm(tokens), H, W

mindcv.models.pvtv2.PyramidVisionTransformerV2

Bases: Cell

Pyramid Vision Transformer V2 model class, based on the paper "PVTv2: Improved Baselines with Pyramid Vision Transformer" (https://arxiv.org/abs/2106.13797).

PARAMETER DESCRIPTION
img_size(int)

size of a input image.

patch_size

size of a single image patch.

TYPE: int) DEFAULT: 16

in_chans

number the channels of the input. Default: 3.

TYPE: int) DEFAULT: 3

num_classes

number of classification classes. Default: 1000.

TYPE: int) DEFAULT: 1000

embed_dims

how many hidden dim in each PatchEmbed.

TYPE: list) DEFAULT: [64, 128, 256, 512]

num_heads

number of attention head in each stage.

TYPE: list) DEFAULT: [1, 2, 4, 8]

mlp_ratios

ratios of MLP hidden dims in each stage.

TYPE: list DEFAULT: [4, 4, 4, 4]

qkv_bias(bool)

use bias in attention.

qk_scale(float)

Scale multiplied by qk in attention(if not none), otherwise head_dim ** -0.5.

drop_rate(float)

The drop rate for each block. Default: 0.0.

attn_drop_rate(float)

The drop rate for attention. Default: 0.0.

drop_path_rate(float)

The drop rate for drop path. Default: 0.0.

norm_layer(nn.Cell)

Norm layer that will be used in blocks. Default: nn.LayerNorm.

depths

number of Blocks.

TYPE: list) DEFAULT: [3, 4, 6, 3]

sr_ratios(list)

stride and kernel size of each attention.

num_stages(int)

number of stage. Default: 4.

linear(bool)

use linear SRA.

Source code in mindcv\models\pvtv2.py
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
class PyramidVisionTransformerV2(nn.Cell):
    r"""Pyramid Vision Transformer V2 model class, based on
    `"PVTv2: Improved Baselines with Pyramid Vision Transformer" <https://arxiv.org/abs/2106.13797>`_

    Args:
        img_size(int) : size of a input image.
        patch_size (int) : size of a single image patch.
        in_chans (int) : number the channels of the input. Default: 3.
        num_classes (int) : number of classification classes. Default: 1000.
        embed_dims (list) : how many hidden dim in each PatchEmbed.
        num_heads (list) : number of attention head in each stage.
        mlp_ratios (list): ratios of MLP hidden dims in each stage.
        qkv_bias(bool) : use bias in attention.
        qk_scale(float) : Scale multiplied by qk in attention(if not none), otherwise head_dim ** -0.5.
        drop_rate(float) : The drop rate for each block. Default: 0.0.
        attn_drop_rate(float) : The drop rate for attention. Default: 0.0.
        drop_path_rate(float) : The drop rate for drop path. Default: 0.0.
        norm_layer(nn.Cell) : Norm layer that will be used in blocks. Default: nn.LayerNorm.
        depths (list) : number of Blocks.
        sr_ratios(list) : stride and kernel size of each attention.
        num_stages(int) : number of stage. Default: 4.
        linear(bool) :  use linear SRA.
    """

    def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dims=[64, 128, 256, 512],
                 num_heads=[1, 2, 4, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=False, qk_scale=None, drop_rate=0.,
                 attn_drop_rate=0., drop_path_rate=0., norm_layer=nn.LayerNorm,
                 depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1], num_stages=4, linear=False):
        super().__init__()
        self.num_classes = num_classes
        self.depths = depths
        self.num_stages = num_stages
        # Kept so reset_classifier() can rebuild the head later (bug fix: the
        # original referenced a never-defined ``self.embed_dim`` there).
        self.embed_dims = embed_dims

        # Stochastic depth decay rule: drop-path rate grows linearly per block.
        dpr = [x.item() for x in np.linspace(0, drop_path_rate, sum(depths))]
        cur = 0

        patch_embed_list = []
        block_list = []
        norm_list = []

        for i in range(num_stages):
            # Stage 0 embeds the raw image (7x7/stride 4); later stages embed
            # the previous stage's feature map (3x3/stride 2).
            patch_embed = OverlapPatchEmbed(img_size=img_size if i == 0 else img_size // (2 ** (i + 1)),
                                            patch_size=7 if i == 0 else 3,
                                            stride=4 if i == 0 else 2,
                                            in_chans=in_chans if i == 0 else embed_dims[i - 1],
                                            embed_dim=embed_dims[i])

            block = nn.CellList([Block(
                dim=embed_dims[i], num_heads=num_heads[i], mlp_ratio=mlp_ratios[i], qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + j], norm_layer=norm_layer,
                sr_ratio=sr_ratios[i], linear=linear, block_id=j)
                for j in range(depths[i])])

            norm = norm_layer([embed_dims[i]])

            cur += depths[i]

            patch_embed_list.append(patch_embed)
            block_list.append(block)
            norm_list.append(norm)
        self.patch_embed_list = nn.CellList(patch_embed_list)
        self.block_list = nn.CellList(block_list)
        self.norm_list = nn.CellList(norm_list)
        # Classification head on the last stage's embedding dim.
        self.head = nn.Dense(embed_dims[-1], num_classes) if num_classes > 0 else Identity()
        self._initialize_weights()

    def freeze_patch_emb(self):
        """Freeze the first-stage patch embedding."""
        self.patch_embed_list[0].requires_grad = False

    def _initialize_weights(self):
        """Init: trunc-normal Dense, unit/zero LayerNorm, fan-out-normal Conv2d."""
        for _, cell in self.cells_and_names():
            if isinstance(cell, nn.Dense):
                cell.weight.set_data(weight_init.initializer(weight_init.TruncatedNormal(sigma=0.02),
                                                             cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(weight_init.initializer(weight_init.Zero(), cell.bias.shape, cell.bias.dtype))
            elif isinstance(cell, nn.LayerNorm):
                cell.gamma.set_data(weight_init.initializer(weight_init.One(), cell.gamma.shape, cell.gamma.dtype))
                cell.beta.set_data(weight_init.initializer(weight_init.Zero(), cell.beta.shape, cell.beta.dtype))
            elif isinstance(cell, nn.Conv2d):
                # He-style init scaled by the (per-group) fan-out.
                fan_out = cell.kernel_size[0] * cell.kernel_size[1] * cell.out_channels
                fan_out //= cell.group
                cell.weight.set_data(weight_init.initializer(weight_init.Normal(sigma=math.sqrt(2.0 / fan_out)),
                                                             cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(weight_init.initializer(weight_init.Zero(), cell.bias.shape, cell.bias.dtype))

    def get_classifier(self):
        """Return the classification head cell."""
        return self.head

    def reset_classifier(self, num_classes, global_pool=""):
        """Replace the classification head for a new number of classes.

        Bug fix: the original referenced the non-existent attribute
        ``self.embed_dim`` (AttributeError on every call); the last stage's
        embedding dim stored in ``__init__`` is used instead.
        """
        self.num_classes = num_classes
        self.head = nn.Dense(self.embed_dims[-1], num_classes) if num_classes > 0 else Identity()

    def forward_features(self, x):
        """Run all stages; return the mean-pooled tokens of the last stage."""
        B = x.shape[0]

        for i in range(self.num_stages):
            patch_embed = self.patch_embed_list[i]
            block = self.block_list[i]
            norm = self.norm_list[i]
            x, H, W = patch_embed(x)
            for blk in block:
                x = blk(x, H, W)
            x = norm(x)
            # All but the last stage hand a (B, C, H, W) map to the next stage.
            if i != self.num_stages - 1:
                x = ops.transpose(ops.reshape(x, (B, H, W, -1)), (0, 3, 1, 2))

        # Global average over the token axis (PVTv2 uses no class token).
        return x.mean(axis=1)

    def forward_head(self, x: Tensor) -> Tensor:
        """Apply the classification head to the pooled features."""
        return self.head(x)

    def construct(self, x):
        x = self.forward_features(x)
        x = self.forward_head(x)

        return x

mindcv.models.pvtv2.pvt_v2_b0(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get PVTV2-b0 model Refer to the base class "models.PVTv2" for more details.

Source code in mindcv\models\pvtv2.py
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
@register_model
def pvt_v2_b0(
    pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs
) -> PyramidVisionTransformerV2:
    """Build the PVTv2-B0 model; see "models.PVTv2" for the full argument list."""
    default_cfg = default_cfgs["pvt_v2_b0"]
    model = PyramidVisionTransformerV2(
        in_chans=in_channels,
        num_classes=num_classes,
        patch_size=4,
        embed_dims=[32, 64, 160, 256],
        num_heads=[1, 2, 5, 8],
        mlp_ratios=[8, 8, 4, 4],
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, epsilon=1e-6),
        depths=[2, 2, 2, 2],
        sr_ratios=[8, 4, 2, 1],
        **kwargs,
    )
    if pretrained:
        load_pretrained(model, default_cfg, num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.pvtv2.pvt_v2_b1(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get PVTV2-b1 model Refer to the base class "models.PVTv2" for more details.

Source code in mindcv\models\pvtv2.py
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
@register_model
def pvt_v2_b1(
    pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs
) -> PyramidVisionTransformerV2:
    """Build the PVTv2-B1 model; see "models.PVTv2" for the full argument list."""
    default_cfg = default_cfgs["pvt_v2_b1"]
    model = PyramidVisionTransformerV2(
        in_chans=in_channels,
        num_classes=num_classes,
        patch_size=4,
        embed_dims=[64, 128, 320, 512],
        num_heads=[1, 2, 5, 8],
        mlp_ratios=[8, 8, 4, 4],
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, epsilon=1e-6),
        depths=[2, 2, 2, 2],
        sr_ratios=[8, 4, 2, 1],
        **kwargs,
    )
    if pretrained:
        load_pretrained(model, default_cfg, num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.pvtv2.pvt_v2_b2(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get PVTV2-b2 model Refer to the base class "models.PVTv2" for more details.

Source code in mindcv\models\pvtv2.py
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
@register_model
def pvt_v2_b2(
    pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs
) -> PyramidVisionTransformerV2:
    """Build the PVTv2-B2 model; see "models.PVTv2" for the full argument list."""
    default_cfg = default_cfgs["pvt_v2_b2"]
    model = PyramidVisionTransformerV2(
        in_chans=in_channels,
        num_classes=num_classes,
        patch_size=4,
        embed_dims=[64, 128, 320, 512],
        num_heads=[1, 2, 5, 8],
        mlp_ratios=[8, 8, 4, 4],
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, epsilon=1e-6),
        depths=[3, 4, 6, 3],
        sr_ratios=[8, 4, 2, 1],
        **kwargs,
    )
    if pretrained:
        load_pretrained(model, default_cfg, num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.pvtv2.pvt_v2_b3(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get PVTV2-b3 model Refer to the base class "models.PVTv2" for more details.

Source code in mindcv\models\pvtv2.py
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
@register_model
def pvt_v2_b3(
    pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs
) -> PyramidVisionTransformerV2:
    """Build the PVTv2-B3 model; see "models.PVTv2" for the full argument list."""
    default_cfg = default_cfgs["pvt_v2_b3"]
    model = PyramidVisionTransformerV2(
        in_chans=in_channels,
        num_classes=num_classes,
        patch_size=4,
        embed_dims=[64, 128, 320, 512],
        num_heads=[1, 2, 5, 8],
        mlp_ratios=[8, 8, 4, 4],
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, epsilon=1e-6),
        depths=[3, 4, 18, 3],
        sr_ratios=[8, 4, 2, 1],
        **kwargs,
    )
    if pretrained:
        load_pretrained(model, default_cfg, num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.pvtv2.pvt_v2_b4(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get PVTV2-b4 model Refer to the base class "models.PVTv2" for more details.

Source code in mindcv\models\pvtv2.py
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
@register_model
def pvt_v2_b4(
    pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs
) -> PyramidVisionTransformerV2:
    """Build the PVTv2-B4 model; see "models.PVTv2" for the full argument list."""
    default_cfg = default_cfgs["pvt_v2_b4"]
    model = PyramidVisionTransformerV2(
        in_chans=in_channels,
        num_classes=num_classes,
        patch_size=4,
        embed_dims=[64, 128, 320, 512],
        num_heads=[1, 2, 5, 8],
        mlp_ratios=[8, 8, 4, 4],
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, epsilon=1e-6),
        depths=[3, 8, 27, 3],
        sr_ratios=[8, 4, 2, 1],
        **kwargs,
    )
    if pretrained:
        load_pretrained(model, default_cfg, num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.pvtv2.pvt_v2_b5(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get PVTV2-b5 model Refer to the base class "models.PVTv2" for more details.

Source code in mindcv\models\pvtv2.py
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
@register_model
def pvt_v2_b5(
    pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs
) -> PyramidVisionTransformerV2:
    """Build the PVTv2-B5 model; see "models.PVTv2" for the full argument list."""
    default_cfg = default_cfgs["pvt_v2_b5"]
    model = PyramidVisionTransformerV2(
        in_chans=in_channels,
        num_classes=num_classes,
        patch_size=4,
        embed_dims=[64, 128, 320, 512],
        num_heads=[1, 2, 5, 8],
        mlp_ratios=[4, 4, 4, 4],
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, epsilon=1e-6),
        depths=[3, 6, 40, 3],
        sr_ratios=[8, 4, 2, 1],
        **kwargs,
    )
    if pretrained:
        load_pretrained(model, default_cfg, num_classes=num_classes, in_channels=in_channels)
    return model

regnet

mindcv.models.regnet

MindSpore implementation of RegNet. Refer to: Designing Network Design Spaces

mindcv.models.regnet.AnyHead

Bases: Cell

AnyNet head: optional conv, AvgPool, 1x1.

Source code in mindcv\models\regnet.py
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
class AnyHead(nn.Cell):
    """AnyNet classification head: optional 1x1 conv-BN-activation, then
    global average pooling and a final linear classifier."""

    def __init__(self, w_in, head_width, num_classes):
        super(AnyHead, self).__init__()
        self.head_width = head_width
        if head_width > 0:
            # Optional channel-projection stage before pooling.
            self.conv = conv2d(w_in, head_width, 1)
            self.bn = norm2d(head_width)
            self.af = activation()
            w_in = head_width
        self.avg_pool = gap2d()
        self.fc = linear(w_in, num_classes, bias=True)

    def construct(self, x):
        if self.head_width > 0:
            x = self.af(self.bn(self.conv(x)))
        pooled = self.avg_pool(x)
        return self.fc(pooled)

mindcv.models.regnet.AnyNet

Bases: Cell

AnyNet model.

Source code in mindcv\models\regnet.py
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
class AnyNet(nn.Cell):
    """AnyNet model: a stem, a list of stages, and a classification head."""

    @staticmethod
    def anynet_get_params(depths, stem_type, stem_w, block_type, widths, strides, bot_muls, group_ws, head_w,
                          num_classes, se_r):
        """Normalize constructor arguments into a single parameter dict."""
        # Absent bot_muls/group_ws become per-stage None placeholders.
        placeholders = [None] * len(depths)
        return {
            "stem_type": stem_type,
            "stem_w": stem_w,
            "block_type": block_type,
            "depths": depths,
            "widths": widths,
            "strides": strides,
            "bot_muls": bot_muls if bot_muls else placeholders,
            "group_ws": group_ws if group_ws else placeholders,
            "head_w": head_w,
            "se_r": se_r,
            "num_classes": num_classes,
        }

    def __init__(self, depths, stem_type, stem_w, block_type, widths, strides, bot_muls, group_ws, head_w, num_classes,
                 se_r, in_channels):
        super(AnyNet, self).__init__()
        cfg = AnyNet.anynet_get_params(depths, stem_type, stem_w, block_type, widths, strides, bot_muls, group_ws,
                                       head_w, num_classes, se_r)
        stem_fun = get_stem_fun(cfg["stem_type"])
        block_fun = get_block_fun(cfg["block_type"])
        self.stem = stem_fun(in_channels, cfg["stem_w"])
        self.stages = nn.CellList()
        prev_w = cfg["stem_w"]
        per_stage = zip(cfg["depths"], cfg["widths"], cfg["strides"], cfg["bot_muls"], cfg["group_ws"])
        for d, w, s, b, g in per_stage:
            block_params = {"bot_mul": b, "group_w": g, "se_r": cfg["se_r"]}
            self.stages.append(AnyStage(prev_w, w, s, d, block_fun, block_params))
            prev_w = w
        self.head = AnyHead(prev_w, cfg["head_w"], cfg["num_classes"])
        self._initialize_weights()

    def _initialize_weights(self) -> None:
        """Initialize weights for cells."""
        for _, m in self.cells_and_names():
            if isinstance(m, nn.Conv2d):
                # Fan-out-scaled normal init for conv kernels.
                fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.set_data(
                    init.initializer(init.Normal(sigma=math.sqrt(2.0 / fan_out), mean=0.0),
                                     m.weight.shape, m.weight.dtype))
            elif isinstance(m, nn.BatchNorm2d):
                m.gamma.set_data(init.initializer("ones", m.gamma.shape, m.gamma.dtype))
                m.beta.set_data(init.initializer("zeros", m.beta.shape, m.beta.dtype))
            elif isinstance(m, nn.Dense):
                m.weight.set_data(
                    init.initializer(init.Normal(sigma=0.01, mean=0.0), m.weight.shape, m.weight.dtype))
                if m.bias is not None:
                    m.bias.set_data(init.initializer("zeros", m.bias.shape, m.bias.dtype))

    def forward_features(self, x):
        """Run the stem and every stage."""
        x = self.stem(x)
        for stage in self.stages:
            x = stage(x)
        return x

    def forward_head(self, x):
        """Apply the classification head."""
        return self.head(x)

    def construct(self, x):
        features = self.forward_features(x)
        return self.forward_head(features)

mindcv.models.regnet.AnyStage

Bases: Cell

AnyNet stage (sequence of blocks w/ the same output shape).

Source code in mindcv\models\regnet.py
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
class AnyStage(nn.Cell):
    """A sequence of blocks sharing one output shape; only the first block
    may change stride or width."""

    def __init__(self, w_in, w_out, stride, d, block_fun, params):
        super(AnyStage, self).__init__()
        self.blocks = nn.CellList()
        for _ in range(d):
            self.blocks.append(block_fun(w_in, w_out, stride, params))
            # After the first block the stage operates at stride 1 on w_out channels.
            stride = 1
            w_in = w_out

    def construct(self, x):
        for blk in self.blocks:
            x = blk(x)
        return x

mindcv.models.regnet.BasicTransform

Bases: Cell

Basic transformation: [3x3 conv, BN, Relu] x2.

Source code in mindcv\models\regnet.py
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
class BasicTransform(nn.Cell):
    """Residual-branch transform: two 3x3 conv-BN pairs with a ReLU between
    them (no activation after the second BN — the caller adds it post-residual)."""

    def __init__(self, w_in, w_out, stride, _params):
        super(BasicTransform, self).__init__()
        self.a = conv2d(w_in, w_out, 3, stride=stride)
        self.a_bn = norm2d(w_out)
        self.a_af = activation()
        self.b = conv2d(w_out, w_out, 3)
        self.b_bn = norm2d(w_out)
        # Flag the branch's last BN; read by init code elsewhere (not visible here).
        self.b_bn.final_bn = True

    def construct(self, x):
        x = self.a_af(self.a_bn(self.a(x)))
        return self.b_bn(self.b(x))

mindcv.models.regnet.BottleneckTransform

Bases: Cell

Bottleneck transformation: 1x1, 3x3 [+SE], 1x1.

Source code in mindcv\models\regnet.py
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
class BottleneckTransform(nn.Cell):
    """Bottleneck transformation: 1x1, 3x3 [+SE], 1x1.

    The final BN has no trailing activation; the residual block applies
    its activation after the skip-connection sum.
    """

    def __init__(self, w_in, w_out, stride, params):
        super(BottleneckTransform, self).__init__()
        w_b = int(round(w_out * params["bot_mul"]))  # bottleneck width
        w_se = int(round(w_in * params["se_r"]))     # SE reduction width; 0 disables SE
        num_groups = w_b // params["group_w"]
        self.a = conv2d(w_in, w_b, 1)
        self.a_bn = norm2d(w_b)
        self.a_af = activation()
        self.b = conv2d(w_b, w_b, 3, stride=stride, groups=num_groups)
        self.b_bn = norm2d(w_b)
        self.b_af = activation()
        self.se = SqueezeExcite(in_channels=w_b, rd_channels=w_se) if w_se else None
        self.c = conv2d(w_b, w_out, 1)
        self.c_bn = norm2d(w_out)
        # Marker attribute; presumably consumed by weight init elsewhere — not read here.
        self.c_bn.final_bn = True

    def construct(self, x):
        out = self.a_af(self.a_bn(self.a(x)))
        out = self.b_af(self.b_bn(self.b(out)))
        if self.se is not None:
            out = self.se(out)
        out = self.c_bn(self.c(out))
        return out

mindcv.models.regnet.RegNet

Bases: AnyNet

RegNet model class, based on "Designing Network Design Spaces" <https://arxiv.org/abs/2003.13678>_

Source code in mindcv\models\regnet.py
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
class RegNet(AnyNet):
    r"""RegNet model class, based on
    `"Designing Network Design Spaces" <https://arxiv.org/abs/2003.13678>`_
    """

    @staticmethod
    def regnet_get_params(w_a, w_0, w_m, d, stride, bot_mul, group_w, stem_type, stem_w, block_type, head_w,
                          num_classes, se_r):
        """Translate the RegNet (w_a, w_0, w_m, d, ...) parameterization into AnyNet kwargs."""
        ws, ds, ss, bs, gs = generate_regnet_full(w_a, w_0, w_m, d, stride, bot_mul, group_w)
        params = dict(stem_type=stem_type, stem_w=stem_w, block_type=block_type)
        params.update(depths=ds, widths=ws, strides=ss, bot_muls=bs, group_ws=gs)
        params.update(head_w=head_w, se_r=se_r, num_classes=num_classes)
        return params

    def __init__(self, w_a, w_0, w_m, d, group_w, stride=2, bot_mul=1.0, stem_type="simple_stem_in", stem_w=32,
                 block_type="res_bottleneck_block", head_w=0, num_classes=1000, se_r=0.0, in_channels=3):
        p = RegNet.regnet_get_params(w_a, w_0, w_m, d, stride, bot_mul, group_w, stem_type, stem_w, block_type,
                                     head_w, num_classes, se_r)
        super(RegNet, self).__init__(p["depths"], p["stem_type"], p["stem_w"], p["block_type"],
                                     p["widths"], p["strides"], p["bot_muls"], p["group_ws"],
                                     p["head_w"], p["num_classes"], p["se_r"], in_channels)
mindcv.models.regnet.RegNet.regnet_get_params(w_a, w_0, w_m, d, stride, bot_mul, group_w, stem_type, stem_w, block_type, head_w, num_classes, se_r) staticmethod

Get AnyNet parameters that correspond to the RegNet.

Source code in mindcv\models\regnet.py
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
@staticmethod
def regnet_get_params(w_a, w_0, w_m, d, stride, bot_mul, group_w, stem_type, stem_w, block_type, head_w,
                      num_classes, se_r):
    """Translate the RegNet (w_a, w_0, w_m, d, ...) parameterization into AnyNet kwargs."""
    ws, ds, ss, bs, gs = generate_regnet_full(w_a, w_0, w_m, d, stride, bot_mul, group_w)
    params = {"stem_type": stem_type, "stem_w": stem_w, "block_type": block_type}
    params["depths"], params["widths"], params["strides"] = ds, ws, ss
    params["bot_muls"], params["group_ws"] = bs, gs
    params.update({"head_w": head_w, "se_r": se_r, "num_classes": num_classes})
    return params

mindcv.models.regnet.ResBasicBlock

Bases: Cell

Residual basic block: x + f(x), f = basic transform.

Source code in mindcv\models\regnet.py
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
class ResBasicBlock(nn.Cell):
    """Residual basic block: x + f(x), f = basic transform.

    A 1x1 projection (conv + BN) is added on the shortcut only when the
    block changes width or resolution.
    """

    def __init__(self, w_in, w_out, stride, params):
        super(ResBasicBlock, self).__init__()
        needs_proj = (w_in != w_out) or (stride != 1)
        self.proj = conv2d(w_in, w_out, 1, stride=stride) if needs_proj else None
        self.bn = norm2d(w_out) if needs_proj else None
        self.f = BasicTransform(w_in, w_out, stride, params)
        self.af = activation()

    def construct(self, x):
        if self.proj is not None:
            shortcut = self.bn(self.proj(x))
        else:
            shortcut = x
        return self.af(shortcut + self.f(x))

mindcv.models.regnet.ResBottleneckBlock

Bases: Cell

Residual bottleneck block: x + f(x), f = bottleneck transform.

Source code in mindcv\models\regnet.py
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
class ResBottleneckBlock(nn.Cell):
    """Residual bottleneck block: x + f(x), f = bottleneck transform.

    A 1x1 projection (conv + BN) is added on the shortcut only when the
    block changes width or resolution.
    """

    def __init__(self, w_in, w_out, stride, params):
        super(ResBottleneckBlock, self).__init__()
        needs_proj = (w_in != w_out) or (stride != 1)
        self.proj = conv2d(w_in, w_out, 1, stride=stride) if needs_proj else None
        self.bn = norm2d(w_out) if needs_proj else None
        self.f = BottleneckTransform(w_in, w_out, stride, params)
        self.af = activation()

    def construct(self, x):
        if self.proj is not None:
            shortcut = self.bn(self.proj(x))
        else:
            shortcut = x
        return self.af(shortcut + self.f(x))

mindcv.models.regnet.ResBottleneckLinearBlock

Bases: Cell

Residual linear bottleneck block: x + f(x), f = bottleneck transform.

Source code in mindcv\models\regnet.py
279
280
281
282
283
284
285
286
287
288
class ResBottleneckLinearBlock(nn.Cell):
    """Residual linear bottleneck block: x + f(x), f = bottleneck transform.

    No activation after the sum; the skip connection is only used when the
    block keeps both width and resolution unchanged.
    """

    def __init__(self, w_in, w_out, stride, params):
        super(ResBottleneckLinearBlock, self).__init__()
        self.has_skip = (w_in == w_out) and (stride == 1)
        self.f = BottleneckTransform(w_in, w_out, stride, params)

    def construct(self, x):
        out = self.f(x)
        if self.has_skip:
            out = x + out
        return out

mindcv.models.regnet.ResStem

Bases: Cell

ResNet stem for ImageNet: 7x7, BN, AF, MaxPool.

Source code in mindcv\models\regnet.py
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
class ResStem(nn.Cell):
    """ResNet stem for ImageNet: 7x7 conv, BN, activation, 3x3 max-pool."""

    def __init__(self, w_in, w_out):
        super(ResStem, self).__init__()
        self.conv = conv2d(w_in, w_out, 7, stride=2)
        self.bn = norm2d(w_out)
        self.af = activation()
        self.pool = pool2d(w_out, 3, stride=2)

    def construct(self, x):
        out = self.af(self.bn(self.conv(x)))
        return self.pool(out)

mindcv.models.regnet.ResStemCifar

Bases: Cell

ResNet stem for CIFAR: 3x3, BN, AF.

Source code in mindcv\models\regnet.py
120
121
122
123
124
125
126
127
128
129
130
131
132
133
class ResStemCifar(nn.Cell):
    """ResNet stem for CIFAR: 3x3 conv, BN, activation (no pooling)."""

    def __init__(self, w_in, w_out):
        super(ResStemCifar, self).__init__()
        self.conv = conv2d(w_in, w_out, 3)
        self.bn = norm2d(w_out)
        self.af = activation()

    def construct(self, x):
        return self.af(self.bn(self.conv(x)))

mindcv.models.regnet.SimpleStem

Bases: Cell

Simple stem for ImageNet: 3x3, BN, AF.

Source code in mindcv\models\regnet.py
154
155
156
157
158
159
160
161
162
163
164
165
166
167
class SimpleStem(nn.Cell):
    """Simple stem for ImageNet: strided 3x3 conv, BN, activation."""

    def __init__(self, w_in, w_out):
        super(SimpleStem, self).__init__()
        self.conv = conv2d(w_in, w_out, 3, stride=2)
        self.bn = norm2d(w_out)
        self.af = activation()

    def construct(self, x):
        return self.af(self.bn(self.conv(x)))

mindcv.models.regnet.VanillaBlock

Bases: Cell

Vanilla block: [3x3 conv, BN, Relu] x2.

Source code in mindcv\models\regnet.py
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
class VanillaBlock(nn.Cell):
    """Vanilla block: [3x3 conv, BN, ReLU] x2, with no residual connection."""

    def __init__(self, w_in, w_out, stride, _params):
        super(VanillaBlock, self).__init__()
        self.a = conv2d(w_in, w_out, 3, stride=stride)
        self.a_bn = norm2d(w_out)
        self.a_af = activation()
        self.b = conv2d(w_out, w_out, 3)
        self.b_bn = norm2d(w_out)
        self.b_af = activation()

    def construct(self, x):
        out = self.a_af(self.a_bn(self.a(x)))
        out = self.b_af(self.b_bn(self.b(out)))
        return out

mindcv.models.regnet.activation()

Helper for building an activation layer.

Source code in mindcv\models\regnet.py
115
116
117
def activation():
    """Helper for building an activation layer (ReLU for all RegNet variants)."""
    act = nn.ReLU()
    return act

mindcv.models.regnet.adjust_block_compatibility(ws, bs, gs)

Adjusts the compatibility of widths, bottlenecks, and groups.

Source code in mindcv\models\regnet.py
427
428
429
430
431
432
433
434
435
436
437
438
def adjust_block_compatibility(ws, bs, gs):
    """Adjust widths/groups so each stage's bottleneck width is divisible by its group width.

    Args:
        ws: per-stage widths.
        bs: per-stage bottleneck multipliers (returned unchanged).
        gs: per-stage group widths.

    Returns:
        Tuple (ws, bs, gs) with ws and gs adjusted for compatibility.
    """
    assert len(ws) == len(bs) == len(gs)
    assert all(w > 0 and b > 0 and g > 0 for w, b, g in zip(ws, bs, gs))
    assert all(b < 1 or b % 1 == 0 for b in bs)
    adj_ws, adj_gs = [], []
    for w, b, g in zip(ws, bs, gs):
        v = int(max(1, w * b))                  # bottleneck width before rounding
        g = int(min(g, v))                      # group width cannot exceed bottleneck width
        m = np.lcm(g, int(b)) if b > 1 else g   # v must be a multiple of m
        v = max(m, int(round(v / m) * m))       # round v to the nearest multiple of m
        adj_ws.append(int(v / b))
        adj_gs.append(g)
    assert all(w * b % g == 0 for w, b, g in zip(adj_ws, bs, adj_gs))
    return adj_ws, bs, adj_gs

mindcv.models.regnet.conv2d(w_in, w_out, k, *, stride=1, groups=1, bias=False)

Helper for building a conv2d layer.

Source code in mindcv\models\regnet.py
84
85
86
87
88
def conv2d(w_in, w_out, k, *, stride=1, groups=1, bias=False):
    """Helper for building a conv2d layer with size-preserving padding (odd kernels only)."""
    assert k % 2 == 1, "Only odd size kernels supported to avoid padding issues."
    pad = (k - 1) // 2
    return nn.Conv2d(w_in, w_out, k, stride=stride, pad_mode="pad", padding=pad, group=groups, has_bias=bias)

mindcv.models.regnet.gap2d(keep_dims=False)

Helper for building a gap2d layer.

Source code in mindcv\models\regnet.py
105
106
107
def gap2d(keep_dims=False):
    """Helper for building a global average pooling layer."""
    pool = GlobalAvgPooling(keep_dims)
    return pool

mindcv.models.regnet.generate_regnet(w_a, w_0, w_m, d, q=8)

Generates per stage widths and depths from RegNet parameters.

Source code in mindcv\models\regnet.py
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
def generate_regnet(w_a, w_0, w_m, d, q=8):
    """Generate per-stage widths and depths from the RegNet parameterization.

    Args:
        w_a: width slope of the linear parameterization.
        w_0: initial width (must be divisible by q).
        w_m: width multiplier between the quantization steps (> 1).
        d: total number of blocks.
        q: width quantization granularity. Default: 8.

    Returns:
        (ws, ds, num_stages, total_stages, ws_all, ws_cont): per-stage widths and
        depths, stage counts, and the per-block quantized/continuous widths.
    """
    assert w_a >= 0 and w_0 > 0 and w_m > 1 and w_0 % q == 0
    # Continuous per-block widths from the linear parameterization
    ws_cont = w_0 + w_a * np.arange(d)
    # Snap each width onto the geometric grid w_0 * w_m**k, then quantize to multiples of q
    ks = np.round(np.log(ws_cont / w_0) / np.log(w_m))
    ws_all = np.round(w_0 * np.power(w_m, ks) / q).astype(int) * q
    # Collapse equal widths into stages (assumes ws_all is sorted)
    ws, ds = np.unique(ws_all, return_counts=True)
    num_stages, total_stages = len(ws), ks.max() + 1
    ws, ds, ws_all, ws_cont = (arr.tolist() for arr in (ws, ds, ws_all, ws_cont))
    return ws, ds, num_stages, total_stages, ws_all, ws_cont

mindcv.models.regnet.generate_regnet_full(w_a, w_0, w_m, d, stride, bot_mul, group_w)

Generates per stage ws, ds, gs, bs, and ss from RegNet cfg.

Source code in mindcv\models\regnet.py
459
460
461
462
463
464
465
466
def generate_regnet_full(w_a, w_0, w_m, d, stride, bot_mul, group_w):
    """Generate per-stage ws, ds, ss, bs, gs from the RegNet configuration."""
    ws, ds = generate_regnet(w_a, w_0, w_m, d)[:2]
    n = len(ws)
    # Stride, bottleneck multiplier and group width are uniform across stages
    ss, bs, gs = [stride] * n, [bot_mul] * n, [group_w] * n
    ws, bs, gs = adjust_block_compatibility(ws, bs, gs)
    return ws, ds, ss, bs, gs

mindcv.models.regnet.get_block_fun(block_type)

Retrieves the block function by name.

Source code in mindcv\models\regnet.py
341
342
343
344
345
346
347
348
349
350
351
def get_block_fun(block_type):
    """Retrieve the block class corresponding to a block-type name."""
    block_funs = {
        "vanilla_block": VanillaBlock,
        "res_basic_block": ResBasicBlock,
        "res_bottleneck_block": ResBottleneckBlock,
        "res_bottleneck_linear_block": ResBottleneckLinearBlock,
    }
    assert block_type in block_funs, "Block type '{}' not supported".format(block_type)
    return block_funs[block_type]

mindcv.models.regnet.get_stem_fun(stem_type)

Retrieves the stem function by name.

Source code in mindcv\models\regnet.py
329
330
331
332
333
334
335
336
337
338
def get_stem_fun(stem_type):
    """Retrieve the stem class corresponding to a stem-type name."""
    stem_funs = {
        "res_stem_cifar": ResStemCifar,
        "res_stem_in": ResStem,
        "simple_stem_in": SimpleStem,
    }
    assert stem_type in stem_funs, "Stem type '{}' not supported".format(stem_type)
    return stem_funs[stem_type]

mindcv.models.regnet.linear(w_in, w_out, *, bias=False)

Helper for building a linear layer.

Source code in mindcv\models\regnet.py
110
111
112
def linear(w_in, w_out, *, bias=False):
    """Helper for building a linear (fully-connected) layer."""
    dense = nn.Dense(w_in, w_out, has_bias=bias)
    return dense

mindcv.models.regnet.norm2d(w_in, eps=1e-05, mom=0.9)

Helper for building a norm2d layer.

Source code in mindcv\models\regnet.py
91
92
93
def norm2d(w_in, eps=1e-5, mom=0.9):
    """Helper for building a 2d batch-norm layer."""
    bn = nn.BatchNorm2d(num_features=w_in, eps=eps, momentum=mom)
    return bn

mindcv.models.regnet.pool2d(_w_in, k, *, stride=1)

Helper for building a pool2d layer.

Source code in mindcv\models\regnet.py
 96
 97
 98
 99
100
101
102
def pool2d(_w_in, k, *, stride=1):
    """Helper for building a max-pool layer: explicit constant pad followed by a valid max-pool."""
    assert k % 2 == 1, "Only odd size kernels supported to avoid padding issues."
    p = (k - 1) // 2
    layers = [
        nn.Pad(((0, 0), (0, 0), (p, p), (p, p)), mode="CONSTANT"),
        nn.MaxPool2d(kernel_size=k, stride=stride, pad_mode="valid"),
    ]
    return nn.SequentialCell(layers)

repmlp

mindcv.models.repmlp

MindSpore implementation of RepMLPNet. Refer to RepMLPNet: Hierarchical Vision MLP with Re-parameterized Locality.

mindcv.models.repmlp.FFNBlock

Bases: Cell

Common FFN layer

Source code in mindcv\models\repmlp.py
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
class FFNBlock(nn.Cell):
    """Common FFN layer: 1x1 conv-BN -> activation -> 1x1 conv-BN."""

    def __init__(self, in_channels, hidden_channels=None, out_channels=None, act_layer=nn.GELU):
        super().__init__()
        # Falsy (None/0) hidden/out channel counts fall back to in_channels
        hidden = hidden_channels or in_channels
        out = out_channels or in_channels
        self.ffn_fc1 = conv_bn(in_channels, hidden, 1, 1, 0, has_bias=False)
        self.ffn_fc2 = conv_bn(hidden, out, 1, 1, 0, has_bias=False)
        self.act = act_layer()

    def construct(self, inputs):
        return self.ffn_fc2(self.act(self.ffn_fc1(inputs)))

mindcv.models.repmlp.GlobalPerceptron

Bases: Cell

GlobalPerceptron layers provide global information (one of the three components of RepMLPBlock)

Source code in mindcv\models\repmlp.py
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
class GlobalPerceptron(nn.Cell):
    """Global Perceptron: produces a per-channel gating vector from globally
    pooled features (one of the three components of RepMLPBlock)."""

    def __init__(self, input_channels, internal_neurons):
        super(GlobalPerceptron, self).__init__()
        self.fc1 = nn.Conv2d(in_channels=input_channels, out_channels=internal_neurons, kernel_size=(1, 1), stride=1,
                             has_bias=True)
        self.fc2 = nn.Conv2d(in_channels=internal_neurons, out_channels=input_channels, kernel_size=(1, 1), stride=1,
                             has_bias=True)

        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.input_channels = input_channels
        self.shape = ops.Shape()

    def construct(self, x):
        dims = self.shape(x)
        # Pool the whole spatial extent down to 1x1, then squeeze-excite
        gate = nn.AvgPool2d(kernel_size=(dims[2], dims[3]), stride=1)(x)
        gate = self.relu(self.fc1(gate))
        gate = self.sigmoid(self.fc2(gate))
        return gate.view(-1, self.input_channels, 1, 1)

mindcv.models.repmlp.RepMLPBlock

Bases: Cell

Basic RepMLPBlock Layer(compose of Global Perceptron, Channel Perceptron and Local Perceptron)

Source code in mindcv\models\repmlp.py
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
class RepMLPBlock(nn.Cell):
    """Basic RepMLPBlock layer, composed of three perceptrons:

    - Global Perceptron (``self.gp``): per-channel gating from global context.
    - Channel Perceptron (``self.fc3``): a grouped 1x1 conv acting as a fully
      connected layer over each (h, w) partition window, shared across
      ``num_sharesets`` channel sets.
    - Local Perceptron (``repconv{k}`` branches): small conv-BN branches that
      can be re-parameterized (merged) into ``fc3`` via ``local_inject``.
    """

    def __init__(self, in_channels, out_channels,
                 h, w,
                 reparam_conv_k=None,
                 globalperceptron_reduce=4,
                 num_sharesets=1,
                 deploy=False):
        super().__init__()

        self.C = in_channels  # noqa: E741
        self.O = out_channels  # noqa: E741
        self.S = num_sharesets  # noqa: E741

        # Partition window size: the feature map is split into (h, w) tiles
        self.h, self.w = h, w

        self.deploy = deploy
        self.transpose = ops.Transpose()
        self.shape = ops.Shape()
        self.reshape = ops.Reshape()

        assert in_channels == out_channels
        self.gp = GlobalPerceptron(input_channels=in_channels, internal_neurons=in_channels // globalperceptron_reduce)

        # Channel Perceptron: grouped 1x1 conv over the flattened (S*h*w) channels.
        # In deploy mode the BN is pre-fused into fc3, so fc3 carries a bias and
        # fc3_bn degenerates to identity.
        self.fc3 = nn.Conv2d(in_channels=self.h * self.w * num_sharesets, out_channels=self.h * self.w * num_sharesets,
                             kernel_size=(1, 1), stride=1, padding=0, has_bias=deploy, group=num_sharesets)
        if deploy:
            self.fc3_bn = ops.Identity()
        else:
            self.fc3_bn = nn.BatchNorm2d(num_sharesets).set_train()

        # Local Perceptron: one conv-BN branch per kernel size, training mode only
        self.reparam_conv_k = reparam_conv_k
        self.conv_branch_k = []
        if not deploy and reparam_conv_k is not None:
            for k in reparam_conv_k:
                conv_branch = conv_bn(num_sharesets, num_sharesets, kernel_size=k, stride=1, padding=k // 2,
                                      group=num_sharesets, has_bias=False)
                self.__setattr__("repconv{}".format(k), conv_branch)
                self.conv_branch_k.append(conv_branch)

    def partition(self, x, h_parts, w_parts):
        """Split (N, C, H, W) into partition windows: (N, h_parts, w_parts, C, h, w)."""
        x = x.reshape(-1, self.C, h_parts, self.h, w_parts, self.w)
        input_perm = (0, 2, 4, 1, 3, 5)
        x = self.transpose(x, input_perm)
        return x

    def partition_affine(self, x, h_parts, w_parts):
        """Apply the Channel Perceptron (fc3 + BN) to every partition window."""
        # Each window's S*h*w values are treated as channels of a 1x1 "image"
        fc_inputs = x.reshape(-1, self.S * self.h * self.w, 1, 1)
        out = self.fc3(fc_inputs)
        out = out.reshape(-1, self.S, self.h, self.w)
        out = self.fc3_bn(out)
        out = out.reshape(-1, h_parts, w_parts, self.S, self.h, self.w)
        return out

    def construct(self, inputs):
        # Global Perceptron: per-channel gate computed from the whole input
        global_vec = self.gp(inputs)

        origin_shape = self.shape(inputs)

        # Assumes H and W are exact multiples of the window size — TODO confirm
        h_parts = origin_shape[2] // self.h
        w_parts = origin_shape[3] // self.w

        partitions = self.partition(inputs, h_parts, w_parts)

        # Channel Perceptron
        fc3_out = self.partition_affine(partitions, h_parts, w_parts)

        # Local Perceptron (training-time conv branches, summed into fc3's output)
        if self.reparam_conv_k is not None and not self.deploy:
            conv_inputs = self.reshape(partitions, (-1, self.S, self.h, self.w))
            conv_out = 0
            for k in self.conv_branch_k:
                conv_out += k(conv_inputs)
            conv_out = self.reshape(conv_out, (-1, h_parts, w_parts, self.S, self.h, self.w))
            fc3_out += conv_out

        # Undo the partitioning and apply the global gate
        input_perm = (0, 3, 1, 4, 2, 5)
        fc3_out = self.transpose(fc3_out, input_perm)  # N, O, h_parts, out_h, w_parts, out_w
        out = fc3_out.reshape(*origin_shape)
        out = out * global_vec
        return out

    def get_equivalent_fc3(self):
        """Compute the fc3 weight/bias with BN and the local conv branches fused in."""
        fc_weight, fc_bias = fuse_bn(self.fc3, self.fc3_bn)
        if self.reparam_conv_k is not None:
            largest_k = max(self.reparam_conv_k)
            largest_branch = self.__getattr__("repconv{}".format(largest_k))
            total_kernel, total_bias = fuse_bn(largest_branch.conv, largest_branch.bn)
            for k in self.reparam_conv_k:
                if k != largest_k:
                    k_branch = self.__getattr__("repconv{}".format(k))
                    kernel, bias = fuse_bn(k_branch.conv, k_branch.bn)
                    # NOTE(review): nn.Pad is a Cell constructed with paddings and then
                    # applied to a tensor; this call looks like torch's functional pad.
                    # Only reached with multiple reparam kernel sizes — verify this path.
                    total_kernel += nn.Pad(kernel, [(largest_k - k) // 2] * 4)
                    total_bias += bias
            rep_weight, rep_bias = self._convert_conv_to_fc(total_kernel, total_bias)
            final_fc3_weight = rep_weight.reshape_as(fc_weight) + fc_weight
            final_fc3_bias = rep_bias + fc_bias
        else:
            final_fc3_weight = fc_weight
            final_fc3_bias = fc_bias
        return final_fc3_weight, final_fc3_bias

    def local_inject(self):
        """Switch the block to deploy mode by merging the local branches into fc3."""
        self.deploy = True
        #   Locality Injection
        fc3_weight, fc3_bias = self.get_equivalent_fc3()
        #   Remove Local Perceptron
        if self.reparam_conv_k is not None:
            for k in self.reparam_conv_k:
                self.__delattr__("repconv{}".format(k))
        self.__delattr__("fc3")
        self.__delattr__("fc3_bn")
        # Rebuild fc3 with a bias (BN fused in) and load the equivalent parameters
        self.fc3 = nn.Conv2d(self.S * self.h * self.w, self.S * self.h * self.w, 1, 1, 0, has_bias=True, group=self.S)
        self.fc3_bn = ops.Identity()
        self.fc3.weight.data = fc3_weight
        self.fc3.bias.data = fc3_bias

    def _convert_conv_to_fc(self, conv_kernel, conv_bias):
        """Express a conv branch as an equivalent FC (fc3-shaped) weight and bias.

        NOTE(review): this looks ported from PyTorch — `.repeat(1, self.S)`,
        `conv_kernel.size(2)` and calling `ops.Conv2D(...)` like a function do not
        match MindSpore's APIs (ops.Conv2D is a primitive constructed with attrs,
        then applied to tensors). Verify this path before relying on local_inject.
        """
        I = ops.eye(self.h * self.w).repeat(1, self.S).reshape(self.h * self.w, self.S, self.h, self.w)  # noqa: E741
        fc_k = ops.Conv2D(I, conv_kernel, pad=(conv_kernel.size(2) // 2, conv_kernel.size(3) // 2), group=self.S)
        fc_k = fc_k.reshape(self.h * self.w, self.S * self.h * self.w).t()
        fc_bias = conv_bias.repeat_interleave(self.h * self.w)
        return fc_k, fc_bias

mindcv.models.repmlp.RepMLPNet

Bases: Cell

RepMLPNet model class, based on "RepMLPNet: Hierarchical Vision MLP with Re-parameterized Locality" <https://arxiv.org/pdf/2112.11081v2.pdf>_

PARAMETER DESCRIPTION
in_channels

number of input channels. Default: 3.

DEFAULT: 3

num_classes

number of classification classes. Default: 1000.

patch_size

size of a single image patch. Default: (4, 4)

DEFAULT: (4, 4)

num_blocks

number of blocks per stage. Default: (2,2,6,2)

DEFAULT: (2, 2, 6, 2)

channels

number of in_channels(channels[stage_idx]) and out_channels(channels[stage_idx + 1]) per stage. Default: (192,384,768,1536)

DEFAULT: (192, 384, 768, 1536)

hs

height of picture per stage. Default: (64,32,16,8)

DEFAULT: (64, 32, 16, 8)

ws

width of picture per stage. Default: (64,32,16,8)

DEFAULT: (64, 32, 16, 8)

sharesets_nums

number of share sets per stage. Default: (4,8,16,32)

DEFAULT: (4, 8, 16, 32)

reparam_conv_k

convolution kernel size in local Perceptron. Default: (3,)

DEFAULT: (3)

globalperceptron_reduce

Channel reduction ratio of the hidden 1x1 convolution in the global perceptron (out_channel = in_channel / globalperceptron_reduce). Default: 4

DEFAULT: 4

use_checkpoint

whether to use checkpoint

DEFAULT: False

deploy

whether to build the re-parameterized (deploy/inference) structure; in this mode fc3 carries a bias and BN layers are replaced by identity

DEFAULT: False

Source code in mindcv\models\repmlp.py
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
class RepMLPNet(nn.Cell):
    r"""RepMLPNet model class, based on
    `"RepMLPNet: Hierarchical Vision MLP with Re-parameterized Locality" <https://arxiv.org/pdf/2112.11081v2.pdf>`_

    Args:
        in_channels: number of input channels. Default: 3.
        num_class: number of classification classes. Default: 1000.
        patch_size: size of a single image patch. Default: (4, 4)
        num_blocks: number of blocks per stage. Default: (2,2,6,2)
        channels: number of in_channels(channels[stage_idx]) and out_channels(channels[stage_idx + 1]) per stage.
            Default: (192,384,768,1536)
        hs: height of picture per stage. Default: (64,32,16,8)
        ws: width of picture per stage. Default: (64,32,16,8)
        sharesets_nums: number of share sets per stage. Default: (4,8,16,32)
        reparam_conv_k: convolution kernel sizes in the local Perceptron. Default: (3,)
        globalperceptron_reduce: channel reduction ratio of the hidden 1x1 convolution in
            the global perceptron (out_channel = in_channel / globalperceptron_reduce). Default: 4
        use_checkpoint: whether to use checkpointing (stored but not read in this class)
        deploy: whether to build the re-parameterized inference structure. Default: False
    """

    def __init__(self,
                 in_channels=3, num_class=1000,
                 patch_size=(4, 4),
                 num_blocks=(2, 2, 6, 2), channels=(192, 384, 768, 1536),
                 hs=(64, 32, 16, 8), ws=(64, 32, 16, 8),
                 sharesets_nums=(4, 8, 16, 32),
                 reparam_conv_k=(3,),
                 globalperceptron_reduce=4, use_checkpoint=False,
                 deploy=False):
        super().__init__()
        # All per-stage configuration tuples must have one entry per stage
        num_stages = len(num_blocks)
        assert num_stages == len(channels)
        assert num_stages == len(hs)
        assert num_stages == len(ws)
        assert num_stages == len(sharesets_nums)

        # Patch embedding: non-overlapping patch_size conv
        self.conv_embedding = conv_bn_relu(in_channels, channels[0], kernel_size=patch_size, stride=patch_size,
                                           padding=0, has_bias=False)
        # NOTE(review): self.conv2d is never used in forward_features/construct —
        # possibly a leftover; confirm before removing.
        self.conv2d = nn.Conv2d(in_channels, channels[0], kernel_size=patch_size, stride=patch_size, padding=0)

        stages = []
        embeds = []
        for stage_idx in range(num_stages):
            stage_blocks = [RepMLPNetUnit(channels=channels[stage_idx], h=hs[stage_idx], w=ws[stage_idx],
                                          reparam_conv_k=reparam_conv_k,
                                          globalperceptron_reduce=globalperceptron_reduce, ffn_expand=4,
                                          num_sharesets=sharesets_nums[stage_idx],
                                          deploy=deploy) for _ in range(num_blocks[stage_idx])]
            stages.append(nn.CellList(stage_blocks))
            # 2x2 stride-2 conv between stages halves resolution and changes width
            if stage_idx < num_stages - 1:
                embeds.append(
                    conv_bn_relu(in_channels=channels[stage_idx], out_channels=channels[stage_idx + 1], kernel_size=2,
                                 stride=2, padding=0))
        self.stages = nn.CellList(stages)
        self.embeds = nn.CellList(embeds)
        self.head_norm = nn.BatchNorm2d(channels[-1]).set_train()
        self.head = nn.Dense(channels[-1], num_class)

        # NOTE(review): use_checkpoint is stored but never read in this class — TODO confirm
        self.use_checkpoint = use_checkpoint
        self.shape = ops.Shape()
        self.reshape = ops.Reshape()
        self._initialize_weights()

    def _initialize_weights(self):
        """Initialize Conv2d/Dense weights and biases with a uniform distribution
        whose bound follows fan-in (torch-default-style init)."""
        for name, cell in self.cells_and_names():
            if isinstance(cell, nn.Conv2d):
                k = cell.group / (cell.in_channels * cell.kernel_size[0] * cell.kernel_size[1])
                k = k ** 0.5
                cell.weight.set_data(init.initializer(init.Uniform(k), cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(init.initializer(init.Uniform(k), cell.bias.shape, cell.bias.dtype))
            elif isinstance(cell, nn.Dense):
                k = 1 / cell.in_channels
                k = k ** 0.5
                cell.weight.set_data(init.initializer(init.Uniform(k), cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(init.initializer(init.Uniform(k), cell.bias.shape, cell.bias.dtype))

    def forward_features(self, x: Tensor) -> Tensor:
        """Run the embedding, all stages, the head norm, and global average pooling.

        Returns a flattened (N, channels[-1]) feature tensor.
        """
        x = self.conv_embedding(x)

        for i, stage in enumerate(self.stages):
            for block in stage:
                x = block(x)

            # Downsample between stages (no embed after the last stage)
            if i < len(self.stages) - 1:
                embed = self.embeds[i]
                x = embed(x)
        x = self.head_norm(x)
        shape = self.shape(x)
        # Global average pooling over the remaining spatial extent
        pool = nn.AvgPool2d(kernel_size=(shape[2], shape[3]))
        x = pool(x)
        return x.view(shape[0], -1)

    def forward_head(self, x: Tensor) -> Tensor:
        """Map pooled features to class logits."""
        return self.head(x)

    def construct(self, x: Tensor) -> Tensor:
        x = self.forward_features(x)
        return self.forward_head(x)

mindcv.models.repmlp.RepMLPNetUnit

Bases: Cell

Basic unit of RepMLPNet

Source code in mindcv\models\repmlp.py
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
class RepMLPNetUnit(nn.Cell):
    """Basic unit of RepMLPNet: pre-norm RepMLP block and pre-norm FFN,
    each wrapped in a residual connection."""

    def __init__(self, channels, h, w, reparam_conv_k, globalperceptron_reduce, ffn_expand=4,
                 num_sharesets=1, deploy=False):
        super().__init__()
        self.repmlp_block = RepMLPBlock(in_channels=channels, out_channels=channels, h=h, w=w,
                                        reparam_conv_k=reparam_conv_k, globalperceptron_reduce=globalperceptron_reduce,
                                        num_sharesets=num_sharesets, deploy=deploy)
        self.ffn_block = FFNBlock(channels, channels * ffn_expand)
        self.prebn1 = nn.BatchNorm2d(channels).set_train()
        self.prebn2 = nn.BatchNorm2d(channels).set_train()

    def construct(self, x):
        out = x + self.repmlp_block(self.prebn1(x))
        out = out + self.ffn_block(self.prebn2(out))
        return out

mindcv.models.repmlp.repmlp_b224(pretrained=False, image_size=224, num_classes=1000, in_channels=3, deploy=False, **kwargs)

Get repmlp_b224 model. Refer to the base class models.RepMLPNet for more details.

Source code in mindcv\models\repmlp.py
418
419
420
421
422
423
424
425
426
427
428
429
430
431
@register_model
def repmlp_b224(pretrained: bool = False, image_size: int = 224, num_classes: int = 1000, in_channels=3,
                deploy=False, **kwargs):
    """Get repmlp_b224 model.
    Refer to the base class `models.RepMLPNet` for more details."""
    default_cfg = default_cfgs["repmlp_b224"]
    model = RepMLPNet(
        in_channels=in_channels,
        num_class=num_classes,
        channels=(96, 192, 384, 768),
        hs=(56, 28, 14, 7),
        ws=(56, 28, 14, 7),
        num_blocks=(2, 2, 12, 2),
        reparam_conv_k=(1, 3),
        sharesets_nums=(1, 4, 32, 128),
        deploy=deploy,
    )
    if pretrained:
        load_pretrained(model, default_cfg, num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.repmlp.repmlp_b256(pretrained=False, image_size=256, num_classes=1000, in_channels=3, deploy=False, **kwargs)

Get repmlp_b256 model. Refer to the base class models.RepMLPNet for more details.

Source code in mindcv\models\repmlp.py
434
435
436
437
438
439
440
441
442
443
444
445
446
447
@register_model
def repmlp_b256(pretrained: bool = False, image_size: int = 256, num_classes: int = 1000, in_channels=3,
                deploy=False, **kwargs):
    """Build the repmlp_b256 variant of `models.RepMLPNet`.

    The 256-resolution "B" configuration: four stages with channel widths
    (96, 192, 384, 768) and feature-map sizes (64, 32, 16, 8).
    """
    cfg = default_cfgs["repmlp_b256"]
    net_kwargs = dict(
        in_channels=in_channels,
        num_class=num_classes,
        channels=(96, 192, 384, 768),
        hs=(64, 32, 16, 8),
        ws=(64, 32, 16, 8),
        num_blocks=(2, 2, 12, 2),
        reparam_conv_k=(1, 3),
        sharesets_nums=(1, 4, 32, 128),
        deploy=deploy,
    )
    model = RepMLPNet(**net_kwargs)
    if pretrained:
        load_pretrained(model, cfg, num_classes=num_classes, in_channels=in_channels)

    return model

mindcv.models.repmlp.repmlp_d256(pretrained=False, image_size=256, num_classes=1000, in_channels=3, deploy=False, **kwargs)

Get repmlp_d256 model. Refer to the base class models.RepMLPNet for more details.

Source code in mindcv\models\repmlp.py
450
451
452
453
454
455
456
457
458
459
460
461
462
463
@register_model
def repmlp_d256(pretrained: bool = False, image_size: int = 256, num_classes: int = 1000, in_channels=3,
                deploy=False, **kwargs):
    """Build the repmlp_d256 variant of `models.RepMLPNet`.

    The deeper "D" configuration at 256 resolution: channel widths
    (80, 160, 320, 640) with 18 blocks in the third stage.
    """
    cfg = default_cfgs["repmlp_d256"]
    net_kwargs = dict(
        in_channels=in_channels,
        num_class=num_classes,
        channels=(80, 160, 320, 640),
        hs=(64, 32, 16, 8),
        ws=(64, 32, 16, 8),
        num_blocks=(2, 2, 18, 2),
        reparam_conv_k=(1, 3),
        sharesets_nums=(1, 4, 16, 128),
        deploy=deploy,
    )
    model = RepMLPNet(**net_kwargs)
    if pretrained:
        load_pretrained(model, cfg, num_classes=num_classes, in_channels=in_channels)

    return model

mindcv.models.repmlp.repmlp_l256(pretrained=False, image_size=256, num_classes=1000, in_channels=3, deploy=False, **kwargs)

Get repmlp_l256 model. Refer to the base class models.RepMLPNet for more details.

Source code in mindcv\models\repmlp.py
466
467
468
469
470
471
472
473
474
475
476
477
478
479
@register_model
def repmlp_l256(pretrained: bool = False, image_size: int = 256, num_classes: int = 1000, in_channels=3,
                deploy=False, **kwargs):
    """Build the repmlp_l256 variant of `models.RepMLPNet`.

    The large "L" configuration at 256 resolution: channel widths
    (96, 192, 384, 768), 18 blocks in the third stage, larger shareset counts.
    """
    cfg = default_cfgs["repmlp_l256"]
    net_kwargs = dict(
        in_channels=in_channels,
        num_class=num_classes,
        channels=(96, 192, 384, 768),
        hs=(64, 32, 16, 8),
        ws=(64, 32, 16, 8),
        num_blocks=(2, 2, 18, 2),
        reparam_conv_k=(1, 3),
        sharesets_nums=(1, 4, 32, 256),
        deploy=deploy,
    )
    model = RepMLPNet(**net_kwargs)
    if pretrained:
        load_pretrained(model, cfg, num_classes=num_classes, in_channels=in_channels)

    return model

mindcv.models.repmlp.repmlp_t224(pretrained=False, image_size=224, num_classes=1000, in_channels=3, deploy=False, **kwargs)

Get repmlp_t224 model. Refer to the base class models.RepMLPNet for more details.

Source code in mindcv\models\repmlp.py
386
387
388
389
390
391
392
393
394
395
396
397
398
399
@register_model
def repmlp_t224(pretrained: bool = False, image_size: int = 224, num_classes: int = 1000, in_channels=3,
                deploy=False, **kwargs):
    """Build the repmlp_t224 variant of `models.RepMLPNet`.

    The tiny "T" configuration at 224 resolution: channel widths
    (64, 128, 256, 512) and feature-map sizes (56, 28, 14, 7).
    """
    cfg = default_cfgs["repmlp_t224"]
    net_kwargs = dict(
        in_channels=in_channels,
        num_class=num_classes,
        channels=(64, 128, 256, 512),
        hs=(56, 28, 14, 7),
        ws=(56, 28, 14, 7),
        num_blocks=(2, 2, 6, 2),
        reparam_conv_k=(1, 3),
        sharesets_nums=(1, 4, 16, 128),
        deploy=deploy,
    )
    model = RepMLPNet(**net_kwargs)
    if pretrained:
        load_pretrained(model, cfg, num_classes=num_classes, in_channels=in_channels)

    return model

mindcv.models.repmlp.repmlp_t256(pretrained=False, image_size=256, num_classes=1000, in_channels=3, deploy=False, **kwargs)

Get repmlp_t256 model. Refer to the base class models.RepMLPNet for more details.

Source code in mindcv\models\repmlp.py
402
403
404
405
406
407
408
409
410
411
412
413
414
415
@register_model
def repmlp_t256(pretrained: bool = False, image_size: int = 256, num_classes: int = 1000, in_channels=3,
                deploy=False, **kwargs):
    """Build the repmlp_t256 variant of `models.RepMLPNet`.

    The tiny "T" configuration at 256 resolution: channel widths
    (64, 128, 256, 512) and feature-map sizes (64, 32, 16, 8).
    """
    cfg = default_cfgs["repmlp_t256"]
    net_kwargs = dict(
        in_channels=in_channels,
        num_class=num_classes,
        channels=(64, 128, 256, 512),
        hs=(64, 32, 16, 8),
        ws=(64, 32, 16, 8),
        num_blocks=(2, 2, 6, 2),
        reparam_conv_k=(1, 3),
        sharesets_nums=(1, 4, 16, 128),
        deploy=deploy,
    )
    model = RepMLPNet(**net_kwargs)
    if pretrained:
        load_pretrained(model, cfg, num_classes=num_classes, in_channels=in_channels)

    return model

repvgg

mindcv.models.repvgg

MindSpore implementation of RepVGG. Refer to RepVGG: Making VGG-style ConvNets Great Again

mindcv.models.repvgg.RepVGG

Bases: Cell

RepVGG model class, based on "RepVGG: Making VGG-style ConvNets Great Again" <https://arxiv.org/pdf/2101.03697>_

PARAMETER DESCRIPTION
num_blocks

number of RepVGGBlocks

TYPE: list)

num_classes

number of classification classes. Default: 1000.

TYPE: int) DEFAULT: 1000

in_channels

number of channels of the input. Default: 3.

TYPE: int) DEFAULT: 3

width_multiplier

per-stage channel width multipliers (4 values).

TYPE: list) DEFAULT: None

override_group_map

optional mapping from block index to the number of convolution groups for that block.

TYPE: dict) DEFAULT: None

deploy

use rbr_reparam block or not. Default: False

TYPE: bool) DEFAULT: False

use_se

use se_block or not. Default: False

TYPE: bool) DEFAULT: False

Source code in mindcv\models\repvgg.py
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
class RepVGG(nn.Cell):
    r"""RepVGG model class, based on
    `"RepVGG: Making VGG-style ConvNets Great Again" <https://arxiv.org/pdf/2101.03697>`_

    A plain VGG-like stack of `RepVGGBlock`s: a stem (`stage0`) followed by four
    stages, global average pooling, and a dense classification head.

    Args:
        num_blocks (list) : number of RepVGGBlocks in each of the four stages.
        num_classes (int) : number of classification classes. Default: 1000.
        in_channels (int) : number of channels of the input. Default: 3.
        width_multiplier (list) : per-stage channel width multipliers; must have 4 entries.
        override_group_map (dict) : optional mapping from block index to the number of
            convolution groups for that block. Default: None.
        deploy (bool) : use rbr_reparam block or not. Default: False
        use_se (bool) : use se_block or not. Default: False
    """

    def __init__(self, num_blocks, num_classes=1000, in_channels=3, width_multiplier=None, override_group_map=None,
                 deploy=False, use_se=False):
        super().__init__()

        assert len(width_multiplier) == 4

        self.deploy = deploy
        self.override_group_map = override_group_map or {}
        self.use_se = use_se

        # Block index 0 is the stem; its group count is never overridden.
        assert 0 not in self.override_group_map

        # Stem width is capped at 64 channels even when width_multiplier[0] > 1.
        self.in_planes = min(64, int(64 * width_multiplier[0]))

        self.stage0 = RepVGGBlock(in_channels=in_channels, out_channels=self.in_planes, kernel_size=3, stride=2,
                                  padding=1,
                                  deploy=self.deploy, use_se=self.use_se)
        self.feature_info = [dict(chs=self.in_planes, reduction=2, name="stage0")]
        # Running block index used by _make_stage to look up per-block group overrides.
        self.cur_layer_idx = 1
        self.stage1 = self._make_stage(
            int(64 * width_multiplier[0]), num_blocks[0], stride=2)
        self.feature_info.append(dict(chs=int(64 * width_multiplier[0]), reduction=4, name="stage1"))
        self.stage2 = self._make_stage(
            int(128 * width_multiplier[1]), num_blocks[1], stride=2)
        self.feature_info.append(dict(chs=int(128 * width_multiplier[1]), reduction=8, name="stage2"))
        self.stage3 = self._make_stage(
            int(256 * width_multiplier[2]), num_blocks[2], stride=2)
        self.feature_info.append(dict(chs=int(256 * width_multiplier[2]), reduction=16, name="stage3"))
        self.stage4 = self._make_stage(
            int(512 * width_multiplier[3]), num_blocks[3], stride=2)
        self.feature_info.append(dict(chs=int(512 * width_multiplier[3]), reduction=32, name="stage4"))
        self.gap = GlobalAvgPooling()
        self.linear = nn.Dense(int(512 * width_multiplier[3]), num_classes)
        self._initialize_weights()

    def _make_stage(self, planes, num_blocks, stride):
        """Build one stage of `num_blocks` RepVGGBlocks.

        Only the first block uses `stride` (downsampling); the rest use stride 1.
        Mutates self.in_planes and self.cur_layer_idx as a side effect.
        """
        strides = [stride] + [1] * (num_blocks - 1)
        blocks = []
        for s in strides:
            # Use the overridden group count for this block index, if any.
            cur_group = self.override_group_map.get(self.cur_layer_idx, 1)
            blocks.append(RepVGGBlock(in_channels=self.in_planes, out_channels=planes, kernel_size=3,
                                      stride=s, padding=1, group=cur_group, deploy=self.deploy,
                                      use_se=self.use_se))
            self.in_planes = planes
            self.cur_layer_idx += 1

        return nn.SequentialCell(blocks)

    def _initialize_weights(self) -> None:
        """Initialize weights for cells."""
        for _, cell in self.cells_and_names():
            if isinstance(cell, nn.Conv2d):
                cell.weight.set_data(
                    init.initializer(init.HeNormal(mode='fan_out', nonlinearity='relu'),
                                     cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(
                        init.initializer('zeros', cell.bias.shape, cell.bias.dtype))
            elif isinstance(cell, nn.BatchNorm2d):
                cell.gamma.set_data(init.initializer('ones', cell.gamma.shape, cell.gamma.dtype))
                cell.beta.set_data(init.initializer('zeros', cell.beta.shape, cell.beta.dtype))
            elif isinstance(cell, nn.Dense):
                cell.weight.set_data(
                    init.initializer(init.HeUniform(mode='fan_in', nonlinearity='sigmoid'),
                                     cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(init.initializer('zeros', cell.bias.shape, cell.bias.dtype))

    def construct(self, x):
        """Forward pass: stem, four stages, global average pool, classifier."""
        x = self.stage0(x)
        x = self.stage1(x)
        x = self.stage2(x)
        x = self.stage3(x)
        x = self.stage4(x)
        x = self.gap(x)
        x = self.linear(x)
        return x

mindcv.models.repvgg.RepVGGBlock

Bases: Cell

Basic Block of RepVGG

Source code in mindcv\models\repvgg.py
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
class RepVGGBlock(nn.Cell):
    """Basic Block of RepVGG.

    At training time the block is the sum of three parallel branches: a 3x3
    conv+BN (`rbr_dense`), a 1x1 conv+BN (`rbr_1x1`) and, when input and output
    shapes match, a BN-only identity branch (`rbr_identity`). At deploy time
    the branches can be fused into a single 3x3 convolution (`rbr_reparam`).

    Args:
        in_channels: number of input channels.
        out_channels: number of output channels.
        kernel_size: convolution kernel size; must be 3.
        stride: convolution stride. Default: 1.
        padding: convolution padding; must be 1. Default: 0.
        dilation: convolution dilation. Default: 1.
        group: number of groups for grouped convolution. Default: 1.
        padding_mode: pad mode of the deploy-time convolution. Default: "zeros".
        deploy: if True, build the fused single-conv form directly. Default: False.
        use_se: if True, insert a squeeze-and-excite module. Default: False.
    """

    def __init__(self, in_channels: int, out_channels: int, kernel_size: int,
                 stride: int = 1, padding: int = 0, dilation: int = 1,
                 group: int = 1, padding_mode: str = "zeros",
                 deploy: bool = False, use_se: bool = False) -> None:
        super().__init__()
        self.deploy = deploy
        self.group = group
        self.in_channels = in_channels

        # The re-parameterization math assumes a 3x3 kernel with "same" padding.
        assert kernel_size == 3
        assert padding == 1

        padding_11 = padding - kernel_size // 2  # padding for the 1x1 branch (= 0)

        self.nonlinearity = nn.ReLU()

        if use_se:
            self.se = SqueezeExcite(
                in_channels=out_channels, rd_channels=out_channels // 16)
        else:
            self.se = Identity()

        if deploy:
            # Deploy mode: a single conv equivalent to the fused training branches.
            self.rbr_reparam = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size,
                                         stride=stride, padding=padding, dilation=dilation, group=group, has_bias=True,
                                         pad_mode=padding_mode)
        else:
            self.rbr_reparam = None
            # Identity branch exists only when the block preserves shape.
            self.rbr_identity = nn.BatchNorm2d(
                num_features=in_channels) if out_channels == in_channels and stride == 1 else None

            self.rbr_dense = conv_bn(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size,
                                     stride=stride, padding=padding, group=group)
            self.rbr_1x1 = conv_bn(in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=stride,
                                   padding=padding_11, group=group)

    def construct(self, inputs: Tensor) -> Tensor:
        """Forward pass: fused conv in deploy mode, branch sum otherwise."""
        if self.rbr_reparam is not None:
            return self.nonlinearity(self.se(self.rbr_reparam(inputs)))

        if self.rbr_identity is None:
            id_out = 0
        else:
            id_out = self.rbr_identity(inputs)

        return self.nonlinearity(self.se(self.rbr_dense(inputs) + self.rbr_1x1(inputs) + id_out))

    def get_custom_l2(self):
        """Custom L2 penalty on the equivalent fused kernel.

        This may improve the accuracy and facilitates quantization in some cases.
        """
        k3 = self.rbr_dense.conv.weight
        k1 = self.rbr_1x1.conv.weight

        # BN scale / std for each branch. MindSpore BatchNorm2d exposes the
        # learnable scale as `gamma` (not `weight` as in torch); this matches
        # the attribute access in _fuse_bn_tensor below.
        t3 = self.rbr_dense.bn.gamma / (
            ops.sqrt((self.rbr_dense.bn.moving_variance + self.rbr_dense.bn.eps)))
        t3 = ops.reshape(t3, (-1, 1, 1, 1))

        t1 = (self.rbr_1x1.bn.gamma /
              (ops.sqrt(self.rbr_1x1.bn.moving_variance + self.rbr_1x1.bn.eps)))
        t1 = ops.reshape(t1, (-1, 1, 1, 1))

        # Penalize the off-center 3x3 weights normally, and the combined center
        # weight through the equivalent fused kernel.
        l2_loss_circle = ops.reduce_sum(k3 ** 2) - ops.reduce_sum(k3[:, :, 1:2, 1:2] ** 2)
        eq_kernel = k3[:, :, 1:2, 1:2] * t3 + k1 * t1
        l2_loss_eq_kernel = ops.reduce_sum(eq_kernel ** 2 / (t3 ** 2 + t1 ** 2))
        return l2_loss_eq_kernel + l2_loss_circle

    #   This func derives the equivalent kernel and bias in a DIFFERENTIABLE way.
    #   You can get the equivalent kernel and bias at any time and do whatever you want,
    #   for example, apply some penalties or constraints during training, just like you do to the other models.
    #   May be useful for quantization or pruning.
    def get_equivalent_kernel_bias(self):
        """Return the (kernel, bias) of the single conv equivalent to all branches."""
        kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense)
        kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1)
        kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity)
        return kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid

    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
        """Zero-pad a 1x1 kernel to 3x3 so it can be added to the dense kernel."""
        if kernel1x1 is None:
            return 0
        return ops.pad(kernel1x1, ((1, 1), (1, 1)))

    def _fuse_bn_tensor(self, branch):
        """Fold a conv+BN branch (or BN-only identity branch) into (kernel, bias)."""
        if branch is None:
            return 0, 0
        if isinstance(branch, nn.SequentialCell):
            kernel = branch.conv.weight
            moving_mean = branch.bn.moving_mean
            moving_variance = branch.bn.moving_variance
            gamma = branch.bn.gamma
            beta = branch.bn.beta
            eps = branch.bn.eps
        else:
            assert isinstance(branch, (nn.BatchNorm2d, nn.SyncBatchNorm))
            if not hasattr(self, "id_tensor"):
                # Build (once) a 3x3 kernel that implements the identity mapping,
                # respecting grouped convolution layout.
                input_dim = self.in_channels // self.group
                kernel_value = np.zeros((self.in_channels, input_dim, 3, 3), dtype=np.float32)
                for i in range(self.in_channels):
                    kernel_value[i, i % input_dim, 1, 1] = 1
                # BatchNorm2d has no `weight` attribute in MindSpore; the
                # learnable scale parameter is `gamma`.
                self.id_tensor = Tensor(kernel_value, dtype=branch.gamma.dtype)
            kernel = self.id_tensor
            moving_mean = branch.moving_mean
            moving_variance = branch.moving_variance
            gamma = branch.gamma
            beta = branch.beta
            eps = branch.eps
        std = ops.sqrt(moving_variance + eps)
        t = ops.reshape(gamma / std, (-1, 1, 1, 1))
        return kernel * t, beta - moving_mean * gamma / std

    def switch_to_deploy(self):
        """Model_convert: fuse the training branches into a single deploy conv."""
        if self.rbr_reparam is not None:
            return
        kernel, bias = self.get_equivalent_kernel_bias()
        self.rbr_reparam = nn.Conv2d(in_channels=self.rbr_dense.conv.in_channels,
                                     out_channels=self.rbr_dense.conv.out_channels,
                                     kernel_size=self.rbr_dense.conv.kernel_size, stride=self.rbr_dense.conv.stride,
                                     padding=self.rbr_dense.conv.padding, dilation=self.rbr_dense.conv.dilation,
                                     group=self.rbr_dense.conv.group, has_bias=True, pad_mode="pad")
        self.rbr_reparam.weight.data = kernel
        self.rbr_reparam.bias.data = bias
        for para in self.parameters():
            para.detach_()
        self.__delattr__("rbr_dense")
        self.__delattr__("rbr_1x1")
        if hasattr(self, "rbr_identity"):
            self.__delattr__("rbr_identity")
        if hasattr(self, "id_tensor"):
            self.__delattr__("id_tensor")
        self.deploy = True
mindcv.models.repvgg.RepVGGBlock.get_custom_l2()

This may improve the accuracy and facilitates quantization in some cases.

Source code in mindcv\models\repvgg.py
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
def get_custom_l2(self):
    """Custom L2 penalty on the equivalent fused kernel.

    This may improve the accuracy and facilitates quantization in some cases.
    """
    k3 = self.rbr_dense.conv.weight
    k1 = self.rbr_1x1.conv.weight

    # BN scale / std per branch. MindSpore BatchNorm2d exposes the learnable
    # scale as `gamma` (not `weight` as in torch).
    t3 = self.rbr_dense.bn.gamma / (
        ops.sqrt((self.rbr_dense.bn.moving_variance + self.rbr_dense.bn.eps)))
    t3 = ops.reshape(t3, (-1, 1, 1, 1))

    t1 = (self.rbr_1x1.bn.gamma /
          (ops.sqrt(self.rbr_1x1.bn.moving_variance + self.rbr_1x1.bn.eps)))
    t1 = ops.reshape(t1, (-1, 1, 1, 1))

    # Penalize off-center 3x3 weights directly, and the center through the
    # equivalent fused kernel.
    l2_loss_circle = ops.reduce_sum(k3 ** 2) - ops.reduce_sum(k3[:, :, 1:2, 1:2] ** 2)
    eq_kernel = k3[:, :, 1:2, 1:2] * t3 + k1 * t1
    l2_loss_eq_kernel = ops.reduce_sum(eq_kernel ** 2 / (t3 ** 2 + t1 ** 2))
    return l2_loss_eq_kernel + l2_loss_circle
mindcv.models.repvgg.RepVGGBlock.switch_to_deploy()

Model_convert

Source code in mindcv\models\repvgg.py
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
def switch_to_deploy(self):
    """Model_convert: fuse the training-time branches into a single deploy conv.

    Builds `rbr_reparam` from the fused kernel/bias, detaches all parameters,
    and deletes the training-time branches. Idempotent: returns immediately
    if the block is already converted.
    """
    if self.rbr_reparam is not None:
        # Already in deploy form.
        return
    kernel, bias = self.get_equivalent_kernel_bias()
    # Mirror the dense branch's conv configuration in the fused conv.
    self.rbr_reparam = nn.Conv2d(in_channels=self.rbr_dense.conv.in_channels,
                                 out_channels=self.rbr_dense.conv.out_channels,
                                 kernel_size=self.rbr_dense.conv.kernel_size, stride=self.rbr_dense.conv.stride,
                                 padding=self.rbr_dense.conv.padding, dilation=self.rbr_dense.conv.dilation,
                                 group=self.rbr_dense.conv.group, has_bias=True, pad_mode="pad")
    self.rbr_reparam.weight.data = kernel
    self.rbr_reparam.bias.data = bias
    for para in self.parameters():
        para.detach_()
    # Remove the now-redundant training-time branches.
    self.__delattr__("rbr_dense")
    self.__delattr__("rbr_1x1")
    if hasattr(self, "rbr_identity"):
        self.__delattr__("rbr_identity")
    if hasattr(self, "id_tensor"):
        self.__delattr__("id_tensor")
    self.deploy = True

mindcv.models.repvgg.repvgg_a0(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get RepVGG model with num_blocks=[2, 4, 14, 1], width_multiplier=[0.75, 0.75, 0.75, 2.5]. Refer to the base class models.RepVGG for more details.

Source code in mindcv\models\repvgg.py
298
299
300
301
302
303
304
305
306
@register_model
def repvgg_a0(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> RepVGG:
    """Build the repvgg_a0 variant: num_blocks=[2, 4, 14, 1],
    width_multiplier=[0.75, 0.75, 0.75, 2.5]. See `models.RepVGG`.
    """
    model_args = dict(
        default_cfg=default_cfgs["repvgg_a0"],
        num_blocks=[2, 4, 14, 1],
        num_classes=num_classes,
        in_channels=in_channels,
        width_multiplier=[0.75, 0.75, 0.75, 2.5],
        override_group_map=None,
        deploy=False,
        **kwargs,
    )
    return _create_repvgg(pretrained, **model_args)

mindcv.models.repvgg.repvgg_a1(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get RepVGG model with num_blocks=[2, 4, 14, 1], width_multiplier=[1.0, 1.0, 1.0, 2.5]. Refer to the base class models.RepVGG for more details.

Source code in mindcv\models\repvgg.py
309
310
311
312
313
314
315
316
317
@register_model
def repvgg_a1(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> RepVGG:
    """Build the repvgg_a1 variant: num_blocks=[2, 4, 14, 1],
    width_multiplier=[1.0, 1.0, 1.0, 2.5]. See `models.RepVGG`.
    """
    model_args = dict(
        default_cfg=default_cfgs["repvgg_a1"],
        num_blocks=[2, 4, 14, 1],
        num_classes=num_classes,
        in_channels=in_channels,
        width_multiplier=[1.0, 1.0, 1.0, 2.5],
        override_group_map=None,
        deploy=False,
        **kwargs,
    )
    return _create_repvgg(pretrained, **model_args)

mindcv.models.repvgg.repvgg_a2(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get RepVGG model with num_blocks=[2, 4, 14, 1], width_multiplier=[1.5, 1.5, 1.5, 2.75]. Refer to the base class models.RepVGG for more details.

Source code in mindcv\models\repvgg.py
320
321
322
323
324
325
326
327
328
@register_model
def repvgg_a2(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> RepVGG:
    """Build the repvgg_a2 variant: num_blocks=[2, 4, 14, 1],
    width_multiplier=[1.5, 1.5, 1.5, 2.75]. See `models.RepVGG`.
    """
    model_args = dict(
        default_cfg=default_cfgs["repvgg_a2"],
        num_blocks=[2, 4, 14, 1],
        num_classes=num_classes,
        in_channels=in_channels,
        width_multiplier=[1.5, 1.5, 1.5, 2.75],
        override_group_map=None,
        deploy=False,
        **kwargs,
    )
    return _create_repvgg(pretrained, **model_args)

mindcv.models.repvgg.repvgg_b0(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get RepVGG model with num_blocks=[4, 6, 16, 1], width_multiplier=[1.0, 1.0, 1.0, 2.5]. Refer to the base class models.RepVGG for more details.

Source code in mindcv\models\repvgg.py
331
332
333
334
335
336
337
338
339
@register_model
def repvgg_b0(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> RepVGG:
    """Build the repvgg_b0 variant: num_blocks=[4, 6, 16, 1],
    width_multiplier=[1.0, 1.0, 1.0, 2.5]. See `models.RepVGG`.
    """
    model_args = dict(
        default_cfg=default_cfgs['repvgg_b0'],
        num_blocks=[4, 6, 16, 1],
        num_classes=num_classes,
        in_channels=in_channels,
        width_multiplier=[1.0, 1.0, 1.0, 2.5],
        override_group_map=None,
        deploy=False,
        **kwargs,
    )
    return _create_repvgg(pretrained, **model_args)

mindcv.models.repvgg.repvgg_b1(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get RepVGG model with num_blocks=[4, 6, 16, 1], width_multiplier=[2.0, 2.0, 2.0, 4.0]. Refer to the base class models.RepVGG for more details.

Source code in mindcv\models\repvgg.py
342
343
344
345
346
347
348
349
350
@register_model
def repvgg_b1(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> RepVGG:
    """Build the repvgg_b1 variant: num_blocks=[4, 6, 16, 1],
    width_multiplier=[2.0, 2.0, 2.0, 4.0]. See `models.RepVGG`.
    """
    model_args = dict(
        default_cfg=default_cfgs['repvgg_b1'],
        num_blocks=[4, 6, 16, 1],
        num_classes=num_classes,
        in_channels=in_channels,
        width_multiplier=[2.0, 2.0, 2.0, 4.0],
        override_group_map=None,
        deploy=False,
        **kwargs,
    )
    return _create_repvgg(pretrained, **model_args)

mindcv.models.repvgg.repvgg_b1g2(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get RepVGG model with num_blocks=[4, 6, 16, 1], width_multiplier=[2.0, 2.0, 2.0, 4.0]. Refer to the base class models.RepVGG for more details.

Source code in mindcv\models\repvgg.py
380
381
382
383
384
385
386
387
388
@register_model
def repvgg_b1g2(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> RepVGG:
    """Build the repvgg_b1g2 variant: num_blocks=[4, 6, 16, 1],
    width_multiplier=[2.0, 2.0, 2.0, 4.0], grouped convs per g2_map. See `models.RepVGG`.
    """
    model_args = dict(
        default_cfg=default_cfgs["repvgg_b1g2"],
        num_blocks=[4, 6, 16, 1],
        num_classes=num_classes,
        in_channels=in_channels,
        width_multiplier=[2.0, 2.0, 2.0, 4.0],
        override_group_map=g2_map,
        deploy=False,
        **kwargs,
    )
    return _create_repvgg(pretrained, **model_args)

mindcv.models.repvgg.repvgg_b1g4(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get RepVGG model with num_blocks=[4, 6, 16, 1], width_multiplier=[2.0, 2.0, 2.0, 4.0]. Refer to the base class models.RepVGG for more details.

Source code in mindcv\models\repvgg.py
391
392
393
394
395
396
397
398
399
@register_model
def repvgg_b1g4(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> RepVGG:
    """Build the repvgg_b1g4 variant: num_blocks=[4, 6, 16, 1],
    width_multiplier=[2.0, 2.0, 2.0, 4.0], grouped convs per g4_map. See `models.RepVGG`.
    """
    model_args = dict(
        default_cfg=default_cfgs["repvgg_b1g4"],
        num_blocks=[4, 6, 16, 1],
        num_classes=num_classes,
        in_channels=in_channels,
        width_multiplier=[2.0, 2.0, 2.0, 4.0],
        override_group_map=g4_map,
        deploy=False,
        **kwargs,
    )
    return _create_repvgg(pretrained, **model_args)

mindcv.models.repvgg.repvgg_b2(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get RepVGG model with num_blocks=[4, 6, 16, 1], width_multiplier=[2.5, 2.5, 2.5, 5.0]. Refer to the base class models.RepVGG for more details.

Source code in mindcv\models\repvgg.py
353
354
355
356
357
358
359
360
361
@register_model
def repvgg_b2(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> RepVGG:
    """Build the repvgg_b2 variant: num_blocks=[4, 6, 16, 1],
    width_multiplier=[2.5, 2.5, 2.5, 5.0]. See `models.RepVGG`.
    """
    model_args = dict(
        default_cfg=default_cfgs['repvgg_b2'],
        num_blocks=[4, 6, 16, 1],
        num_classes=num_classes,
        in_channels=in_channels,
        width_multiplier=[2.5, 2.5, 2.5, 5.0],
        override_group_map=None,
        deploy=False,
        **kwargs,
    )
    return _create_repvgg(pretrained, **model_args)

mindcv.models.repvgg.repvgg_b2g4(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get RepVGG model with num_blocks=[4, 6, 16, 1], width_multiplier=[2.5, 2.5, 2.5, 5.0]. Refer to the base class models.RepVGG for more details.

Source code in mindcv\models\repvgg.py
402
403
404
405
406
407
408
409
410
@register_model
def repvgg_b2g4(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> RepVGG:
    """Build the repvgg_b2g4 variant: num_blocks=[4, 6, 16, 1],
    width_multiplier=[2.5, 2.5, 2.5, 5.0], grouped convs per g4_map. See `models.RepVGG`.
    """
    model_args = dict(
        default_cfg=default_cfgs["repvgg_b2g4"],
        num_blocks=[4, 6, 16, 1],
        num_classes=num_classes,
        in_channels=in_channels,
        width_multiplier=[2.5, 2.5, 2.5, 5.0],
        override_group_map=g4_map,
        deploy=False,
        **kwargs,
    )
    return _create_repvgg(pretrained, **model_args)

mindcv.models.repvgg.repvgg_b3(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get RepVGG model with num_blocks=[4, 6, 16, 1], width_multiplier=[3.0, 3.0, 3.0, 5.0]. Refer to the base class models.RepVGG for more details.

Source code in mindcv\models\repvgg.py
364
365
366
367
368
369
370
371
372
@register_model
def repvgg_b3(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> RepVGG:
    """Build the repvgg_b3 variant: num_blocks=[4, 6, 16, 1],
    width_multiplier=[3.0, 3.0, 3.0, 5.0]. See `models.RepVGG`.
    """
    model_args = dict(
        default_cfg=default_cfgs['repvgg_b3'],
        num_blocks=[4, 6, 16, 1],
        num_classes=num_classes,
        in_channels=in_channels,
        width_multiplier=[3.0, 3.0, 3.0, 5.0],
        override_group_map=None,
        deploy=False,
        **kwargs,
    )
    return _create_repvgg(pretrained, **model_args)

mindcv.models.repvgg.repvgg_model_convert(model, save_path=None, do_copy=True)

repvgg_model_convert

Source code in mindcv\models\repvgg.py
413
414
415
416
417
418
419
420
421
422
def repvgg_model_convert(model: nn.Cell, save_path=None, do_copy=True):
    """Convert a training-time RepVGG model to its fused deploy-time form.

    Args:
        model: model whose RepVGG blocks should be re-parameterized.
        save_path: if given, save the converted checkpoint to this path. Default: None.
        do_copy: if True, convert a deep copy and leave `model` untouched. Default: True.

    Returns:
        The converted model.
    """
    if do_copy:
        model = copy.deepcopy(model)
    # MindSpore cells are traversed via cells_and_names(); `modules()` is a
    # torch API and does not exist on nn.Cell.
    for _, cell in model.cells_and_names():
        if hasattr(cell, "switch_to_deploy"):
            cell.switch_to_deploy()
    if save_path is not None:
        # save_checkpoint accepts the Cell itself and serializes its parameters.
        save_checkpoint(model, save_path)
    return model

res2net

mindcv.models.res2net

MindSpore implementation of Res2Net. Refer to Res2Net: A New Multi-scale Backbone Architecture.

mindcv.models.res2net.Res2Net

Bases: Cell

Res2Net model class, based on "Res2Net: A New Multi-scale Backbone Architecture" <https://arxiv.org/abs/1904.01169>_

PARAMETER DESCRIPTION
block

block of resnet.

TYPE: Type[Cell]

layer_nums

number of layers of each stage.

TYPE: List[int]

version

variety of Res2Net, 'res2net' or 'res2net_v1b'. Default: 'res2net'.

TYPE: str DEFAULT: 'res2net'

num_classes

number of classification classes. Default: 1000.

TYPE: int DEFAULT: 1000

in_channels

number the channels of the input. Default: 3.

TYPE: int DEFAULT: 3

groups

number of groups for group conv in blocks. Default: 1.

TYPE: int DEFAULT: 1

base_width

base width of pre group hidden channel in blocks. Default: 26.

TYPE: int DEFAULT: 26

scale

scale factor of Bottle2neck. Default: 4.

DEFAULT: 4

norm

normalization layer in blocks. Default: None.

TYPE: Optional[Cell] DEFAULT: None

Source code in mindcv\models\res2net.py
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
class Res2Net(nn.Cell):
    r"""Res2Net model class, based on
    `"Res2Net: A New Multi-scale Backbone Architecture" <https://arxiv.org/abs/1904.01169>`_

    Args:
        block: block class of the network (e.g. Bottle2neck).
        layer_nums: number of blocks in each of the four stages.
        version: variety of Res2Net, 'res2net' or 'res2net_v1b'. Default: 'res2net'.
        num_classes: number of classification classes. Default: 1000.
        in_channels: number of channels of the input. Default: 3.
        groups: number of groups for group conv in blocks. Default: 1.
        base_width: base width of per-group hidden channels in blocks. Default: 26.
        scale: scale factor of Bottle2neck. Default: 4.
        norm: normalization layer class used in blocks. Default: None (nn.BatchNorm2d).
    """

    def __init__(
        self,
        block: Type[nn.Cell],
        layer_nums: List[int],
        version: str = "res2net",
        num_classes: int = 1000,
        in_channels: int = 3,
        groups: int = 1,
        base_width: int = 26,
        scale: int = 4,
        norm: Optional[nn.Cell] = None,
    ) -> None:
        super().__init__()
        assert version in ["res2net", "res2net_v1b"]
        self.version = version

        if norm is None:
            norm = nn.BatchNorm2d
        self.norm = norm

        self.num_classes = num_classes
        # Running input-channel count for stage construction; mutated by _make_layer.
        self.input_channels = 64
        self.groups = groups
        self.base_width = base_width
        self.scale = scale
        # Stem: plain v1 uses a single 7x7 stride-2 conv; v1b uses a deeper
        # stem of three 3x3 convs (the "deep stem" trick from ResNet-D).
        if self.version == "res2net":
            self.conv1 = nn.Conv2d(in_channels, self.input_channels, kernel_size=7,
                                   stride=2, padding=3, pad_mode="pad")
        elif self.version == "res2net_v1b":
            self.conv1 = nn.SequentialCell([
                nn.Conv2d(in_channels, self.input_channels // 2, kernel_size=3,
                          stride=2, padding=1, pad_mode="pad"),
                norm(self.input_channels // 2),
                nn.ReLU(),
                nn.Conv2d(self.input_channels // 2, self.input_channels // 2, kernel_size=3,
                          stride=1, padding=1, pad_mode="pad"),
                norm(self.input_channels // 2),
                nn.ReLU(),
                nn.Conv2d(self.input_channels // 2, self.input_channels, kernel_size=3,
                          stride=1, padding=1, pad_mode="pad"),
            ])

        self.bn1 = norm(self.input_channels)
        self.relu = nn.ReLU()
        # Explicit constant padding followed by a valid-mode max pool to mimic
        # a padded 3x3/stride-2 pooling.
        self.max_pool = nn.SequentialCell([
            nn.Pad(paddings=((0, 0), (0, 0), (1, 1), (1, 1)), mode="CONSTANT"),
            nn.MaxPool2d(kernel_size=3, stride=2)
        ])
        # Four stages; stages 2-4 downsample spatially by 2.
        self.layer1 = self._make_layer(block, 64, layer_nums[0])
        self.layer2 = self._make_layer(block, 128, layer_nums[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layer_nums[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layer_nums[3], stride=2)

        self.pool = GlobalAvgPooling()
        self.num_features = 512 * block.expansion
        self.classifier = nn.Dense(self.num_features, num_classes)
        self._initialize_weights()

    def _initialize_weights(self) -> None:
        """Initialize weights for cells (He init for convs/dense, ones/zeros for BN)."""
        for _, cell in self.cells_and_names():
            if isinstance(cell, nn.Conv2d):
                cell.weight.set_data(
                    init.initializer(init.HeNormal(math.sqrt(5), mode="fan_out", nonlinearity="relu"),
                                     cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(
                        init.initializer(init.HeUniform(math.sqrt(5), mode="fan_in", nonlinearity="leaky_relu"),
                                         cell.bias.shape, cell.bias.dtype))
            elif isinstance(cell, nn.BatchNorm2d):
                cell.gamma.set_data(init.initializer("ones", cell.gamma.shape, cell.gamma.dtype))
                cell.beta.set_data(init.initializer("zeros", cell.beta.shape, cell.beta.dtype))
            elif isinstance(cell, nn.Dense):
                cell.weight.set_data(
                    init.initializer(init.HeUniform(math.sqrt(5), mode="fan_in", nonlinearity="leaky_relu"),
                                     cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(init.initializer("zeros", cell.bias.shape, cell.bias.dtype))

    def _make_layer(
        self,
        block: Type[nn.Cell],
        channels: int,
        block_nums: int,
        stride: int = 1,
    ) -> nn.SequentialCell:
        """Build one stage of `block_nums` blocks; updates self.input_channels."""
        down_sample = None

        # A projection shortcut is needed when the spatial size or channel
        # count changes. v1b uses avg-pool + 1x1 conv for strided shortcuts
        # (ResNet-D style); v1 uses a strided 1x1 conv.
        if stride != 1 or self.input_channels != channels * block.expansion:
            if stride == 1 or self.version == "res2net":
                down_sample = nn.SequentialCell([
                    nn.Conv2d(self.input_channels, channels * block.expansion, kernel_size=1, stride=stride),
                    self.norm(channels * block.expansion)
                ])
            else:
                down_sample = nn.SequentialCell([
                    nn.AvgPool2d(kernel_size=stride, stride=stride, pad_mode="same"),
                    nn.Conv2d(self.input_channels, channels * block.expansion, kernel_size=1, stride=1),
                    self.norm(channels * block.expansion)
                ])

        layers = []
        # First block of the stage handles stride/shortcut; stype="stage"
        # marks it as a stage-transition block for Bottle2neck.
        layers.append(
            block(
                self.input_channels,
                channels,
                stride=stride,
                down_sample=down_sample,
                groups=self.groups,
                base_width=self.base_width,
                scale=self.scale,
                stype="stage",
                norm=self.norm,
            )
        )
        self.input_channels = channels * block.expansion

        # Remaining blocks keep stride 1 and identity shortcuts.
        for _ in range(1, block_nums):
            layers.append(
                block(
                    self.input_channels,
                    channels,
                    groups=self.groups,
                    base_width=self.base_width,
                    scale=self.scale,
                    norm=self.norm,
                )
            )

        return nn.SequentialCell(layers)

    def forward_features(self, x: Tensor) -> Tensor:
        """Extract convolutional features (stem + four stages)."""
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.max_pool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        return x

    def forward_head(self, x: Tensor) -> Tensor:
        """Global average pool + linear classifier."""
        x = self.pool(x)
        x = self.classifier(x)
        return x

    def construct(self, x: Tensor) -> Tensor:
        x = self.forward_features(x)
        x = self.forward_head(x)
        return x

mindcv.models.res2net.res2net101(pretrained=False, num_classes=1001, in_channels=3, **kwargs)

Get 101 layers Res2Net model. Refer to the base class models.Res2Net for more details.

Source code in mindcv\models\res2net.py
326
327
328
329
330
331
332
333
334
335
336
337
@register_model
def res2net101(pretrained: bool = False, num_classes: int = 1001, in_channels=3, **kwargs):
    """Build a 101-layer Res2Net (stage depths 3-4-23-3).

    Refer to the base class `models.Res2Net` for more details.
    """
    model = Res2Net(Bottle2neck, [3, 4, 23, 3], num_classes=num_classes, in_channels=in_channels, **kwargs)
    if pretrained:
        load_pretrained(model, default_cfgs["res2net101"], num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.res2net.res2net152(pretrained=False, num_classes=1001, in_channels=3, **kwargs)

Get 152 layers Res2Net model. Refer to the base class models.Res2Net for more details.

Source code in mindcv\models\res2net.py
340
341
342
343
344
345
346
347
348
349
350
351
@register_model
def res2net152(pretrained: bool = False, num_classes: int = 1001, in_channels=3, **kwargs):
    """Build a 152-layer Res2Net (stage depths 3-8-36-3).

    Refer to the base class `models.Res2Net` for more details.
    """
    model = Res2Net(Bottle2neck, [3, 8, 36, 3], num_classes=num_classes, in_channels=in_channels, **kwargs)
    if pretrained:
        load_pretrained(model, default_cfgs["res2net152"], num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.res2net.res2net50(pretrained=False, num_classes=1001, in_channels=3, **kwargs)

Get 50 layers Res2Net model. Refer to the base class models.Res2Net for more details.

Source code in mindcv\models\res2net.py
312
313
314
315
316
317
318
319
320
321
322
323
@register_model
def res2net50(pretrained: bool = False, num_classes: int = 1001, in_channels=3, **kwargs):
    """Build a 50-layer Res2Net (stage depths 3-4-6-3).

    Refer to the base class `models.Res2Net` for more details.
    """
    model = Res2Net(Bottle2neck, [3, 4, 6, 3], num_classes=num_classes, in_channels=in_channels, **kwargs)
    if pretrained:
        load_pretrained(model, default_cfgs["res2net50"], num_classes=num_classes, in_channels=in_channels)
    return model

resnest

mindcv.models.resnest

MindSpore implementation of ResNeSt. Refer to ResNeSt: Split-Attention Networks.

mindcv.models.resnest.Bottleneck

Bases: Cell

ResNeSt Bottleneck

Source code in mindcv\models\resnest.py
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
class Bottleneck(nn.Cell):
    """ResNeSt Bottleneck.

    A 1x1 -> 3x3 -> 1x1 residual bottleneck where the 3x3 conv is a
    Split-Attention conv (``SplitAttn``) when ``radix >= 1``, and a plain
    grouped conv otherwise. When ``avd`` is active, downsampling is done by
    an average-pooling layer (before or after the 3x3 conv, per
    ``avd_first``) instead of conv stride.

    Args:
        inplanes: number of input channels.
        planes: base number of output channels (output is ``planes * 4``).
        stride: stride of the block. Default: 1.
        downsample: optional projection shortcut. Default: None.
        radix: number of splits for Split-Attention conv. Default: 1.
        cardinality: number of conv groups. Default: 1.
        bottleneck_width: bottleneck channels factor. Default: 64.
        avd: use avg pooling for downsampling. Default: False.
        avd_first: apply the avg pooling before (True) or after (False) the 3x3 conv. Default: False.
        dilation: dilation of the 3x3 conv. Default: 1.
        is_first: whether this is the first block of a stage. Default: False.
        norm_layer: normalization layer class. Default: None.
    """

    expansion = 4

    def __init__(
        self,
        inplanes: int,
        planes: int,
        stride: int = 1,
        downsample: Optional[nn.SequentialCell] = None,
        radix: int = 1,
        cardinality: int = 1,
        bottleneck_width: int = 64,
        avd: bool = False,
        avd_first: bool = False,
        dilation: int = 1,
        is_first: bool = False,
        norm_layer: Optional[nn.Cell] = None,
    ) -> None:
        super(Bottleneck, self).__init__()
        group_width = int(planes * (bottleneck_width / 64.0)) * cardinality
        self.conv1 = nn.Conv2d(inplanes, group_width, kernel_size=1, has_bias=False)
        self.bn1 = norm_layer(group_width)
        self.radix = radix
        # avd only takes effect when downsampling, or on the first block of a stage.
        self.avd = avd and (stride > 1 or is_first)
        self.avd_first = avd_first

        if self.avd:
            self.avd_layer = nn.AvgPool2d(3, stride, pad_mode="same")
            # Pooling now performs the downsampling, so the conv runs unstrided.
            stride = 1

        if radix >= 1:
            # Split-Attention conv carries its own normalization internally.
            self.conv2 = SplitAttn(group_width, group_width, kernel_size=3, stride=stride,
                                   padding=dilation, dilation=dilation, group=cardinality,
                                   bias=False, radix=radix, norm_layer=norm_layer)
        else:
            self.conv2 = nn.Conv2d(group_width, group_width, kernel_size=3, stride=stride,
                                   pad_mode="pad", padding=dilation, dilation=dilation,
                                   group=cardinality, has_bias=False)
            self.bn2 = norm_layer(group_width)

        self.conv3 = nn.Conv2d(group_width, planes * 4, kernel_size=1, has_bias=False)
        self.bn3 = norm_layer(planes * 4)

        self.relu = nn.ReLU()
        self.downsample = downsample
        self.dilation = dilation
        self.stride = stride

    def construct(self, x: Tensor) -> Tensor:
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        if self.avd and self.avd_first:
            out = self.avd_layer(out)

        out = self.conv2(out)
        # Plain-conv branch (radix == 0) needs explicit norm + activation;
        # SplitAttn already applied its own.
        if self.radix == 0:
            out = self.bn2(out)
            out = self.relu(out)

        if self.avd and not self.avd_first:
            out = self.avd_layer(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out

mindcv.models.resnest.ResNeSt

Bases: Cell

ResNeSt model class, based on "ResNeSt: Split-Attention Networks" <https://arxiv.org/abs/2004.08955>_

PARAMETER DESCRIPTION
block

Class for the residual block. Option is Bottleneck.

TYPE: Type[Bottleneck]

layers

Numbers of layers in each block.

TYPE: List[int]

radix

Number of groups for Split-Attention conv. Default: 1.

TYPE: int DEFAULT: 1

group

Number of groups for the conv in each bottleneck block. Default: 1.

TYPE: int DEFAULT: 1

bottleneck_width

bottleneck channels factor. Default: 64.

TYPE: int DEFAULT: 64

num_classes

Number of classification classes. Default: 1000.

TYPE: int DEFAULT: 1000

dilated

Applying dilation strategy to pretrained ResNeSt yielding a stride-8 model, typically used in Semantic Segmentation. Default: False.

TYPE: bool DEFAULT: False

dilation

Number of dilation in the conv. Default: 1.

TYPE: int DEFAULT: 1

deep_stem

three 3x3 convolution layers of widths stem_width, stem_width, stem_width * 2. Default: False.

TYPE: bool DEFAULT: False

stem_width

number of channels in stem convolutions. Default: 64.

TYPE: int DEFAULT: 64

avg_down

use avg pooling for projection skip connection between stages/downsample. Default: False.

TYPE: bool DEFAULT: False

avd

use avg pooling before or after split-attention conv. Default: False.

TYPE: bool DEFAULT: False

avd_first

if True, apply the avg pooling before the split-attention conv instead of after it. Default: False.

TYPE: bool DEFAULT: False

drop_rate

Drop probability for the Dropout layer. Default: 0.

TYPE: float DEFAULT: 0.0

norm_layer

Normalization layer used in backbone network. Default: nn.BatchNorm2d.

TYPE: Cell DEFAULT: BatchNorm2d

Source code in mindcv\models\resnest.py
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
class ResNeSt(nn.Cell):
    r"""ResNeSt model class, based on
    `"ResNeSt: Split-Attention Networks" <https://arxiv.org/abs/2004.08955>`_

    Args:
        block: Class for the residual block. Option is Bottleneck.
        layers: Numbers of layers in each block.
        radix: Number of groups for Split-Attention conv. Default: 1.
        group: Number of groups for the conv in each bottleneck block. Default: 1.
        bottleneck_width: bottleneck channels factor. Default: 64.
        num_classes: Number of classification classes. Default: 1000.
        dilated: Applying dilation strategy to pretrained ResNeSt yielding a stride-8 model,
                 typically used in Semantic Segmentation. Default: False.
        dilation: Number of dilation in the conv. Default: 1.
        deep_stem: three 3x3 convolution layers of widths stem_width, stem_width, stem_width * 2.
                   Default: False.
        stem_width: number of channels in stem convolutions. Default: 64.
        avg_down: use avg pooling for projection skip connection between stages/downsample.
                  Default: False.
        avd: use avg pooling before or after split-attention conv. Default: False.
        avd_first: apply the avg pooling before (True) or after (False) the split-attention conv.
                   Default: False.
        drop_rate: Drop probability for the Dropout layer. Default: 0.
        norm_layer: Normalization layer used in backbone network. Default: nn.BatchNorm2d.
    """

    def __init__(
        self,
        block: Type[Bottleneck],
        layers: List[int],
        radix: int = 1,
        group: int = 1,
        bottleneck_width: int = 64,
        num_classes: int = 1000,
        dilated: bool = False,
        dilation: int = 1,
        deep_stem: bool = False,
        stem_width: int = 64,
        avg_down: bool = False,
        avd: bool = False,
        avd_first: bool = False,
        drop_rate: float = 0.0,
        norm_layer: nn.Cell = nn.BatchNorm2d,
    ) -> None:
        super(ResNeSt, self).__init__()
        self.cardinality = group
        self.bottleneck_width = bottleneck_width
        # ResNet-D params
        # Running input-channel count for stage construction; mutated by _make_layer.
        self.inplanes = stem_width * 2 if deep_stem else 64
        self.avg_down = avg_down
        # ResNeSt params
        self.radix = radix
        self.avd = avd
        self.avd_first = avd_first

        # Stem: either three 3x3 convs (deep stem) or a single 7x7 conv.
        if deep_stem:
            self.conv1 = nn.SequentialCell([
                nn.Conv2d(3, stem_width, kernel_size=3, stride=2, pad_mode="pad",
                          padding=1, has_bias=False),
                norm_layer(stem_width),
                nn.ReLU(),
                nn.Conv2d(stem_width, stem_width, kernel_size=3, stride=1, pad_mode="pad",
                          padding=1, has_bias=False),
                norm_layer(stem_width),
                nn.ReLU(),
                nn.Conv2d(stem_width, stem_width * 2, kernel_size=3, stride=1, pad_mode="pad",
                          padding=1, has_bias=False),
            ])
        else:
            self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, pad_mode="pad", padding=3,
                                   has_bias=False)

        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU()
        self.feature_info = [dict(chs=self.inplanes, reduction=2, name="relu")]
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, pad_mode="same")

        self.layer1 = self._make_layer(block, 64, layers[0], norm_layer=norm_layer, is_first=False)
        self.feature_info.append(dict(chs=block.expansion * 64, reduction=4, name='layer1'))
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2, norm_layer=norm_layer)
        self.feature_info.append(dict(chs=block.expansion * 128, reduction=8, name='layer2'))

        # Stages 3/4: dilation replaces stride when a stride-8 or stride-16
        # output (for dense prediction) is requested.
        if dilated or dilation == 4:
            self.layer3 = self._make_layer(block, 256, layers[2], stride=1, dilation=2, norm_layer=norm_layer)
            self.feature_info.append(dict(chs=block.expansion * 256, reduction=8, name='layer3'))
            self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=4, norm_layer=norm_layer)
            self.feature_info.append(dict(chs=block.expansion * 512, reduction=8, name='layer4'))
        elif dilation == 2:
            self.layer3 = self._make_layer(block, 256, layers[2], stride=2, dilation=1, norm_layer=norm_layer)
            self.feature_info.append(dict(chs=block.expansion * 256, reduction=16, name='layer3'))
            self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=2, norm_layer=norm_layer)
            self.feature_info.append(dict(chs=block.expansion * 512, reduction=16, name='layer4'))
        else:
            self.layer3 = self._make_layer(block, 256, layers[2], stride=2, norm_layer=norm_layer)
            self.feature_info.append(dict(chs=block.expansion * 256, reduction=16, name='layer3'))
            self.layer4 = self._make_layer(block, 512, layers[3], stride=2, norm_layer=norm_layer)
            self.feature_info.append(dict(chs=block.expansion * 512, reduction=32, name='layer4'))

        self.avgpool = GlobalAvgPooling()
        self.drop = Dropout(p=drop_rate) if drop_rate > 0.0 else None
        self.fc = nn.Dense(512 * block.expansion, num_classes)

        self._initialize_weights()

    def _initialize_weights(self) -> None:
        """Initialize weights for cells (He init for convs/dense, ones/zeros for BN)."""
        for _, cell in self.cells_and_names():
            if isinstance(cell, nn.Conv2d):
                cell.weight.set_data(
                    init.initializer(
                        init.HeNormal(mode="fan_out", nonlinearity="relu"), cell.weight.shape, cell.weight.dtype
                    )
                )
                if cell.bias is not None:
                    cell.bias.set_data(init.initializer("zeros", cell.bias.shape, cell.bias.dtype))
            elif isinstance(cell, nn.BatchNorm2d):
                cell.gamma.set_data(init.initializer("ones", cell.gamma.shape, cell.gamma.dtype))
                cell.beta.set_data(init.initializer("zeros", cell.beta.shape, cell.beta.dtype))
            elif isinstance(cell, nn.Dense):
                cell.weight.set_data(
                    init.initializer(
                        init.HeUniform(mode="fan_in", nonlinearity="sigmoid"), cell.weight.shape, cell.weight.dtype
                    )
                )
                if cell.bias is not None:
                    cell.bias.set_data(init.initializer("zeros", cell.bias.shape, cell.bias.dtype))

    def _make_layer(
        self,
        block: Type[Bottleneck],
        planes: int,
        blocks: int,
        stride: int = 1,
        dilation: int = 1,
        norm_layer: Optional[nn.Cell] = None,
        is_first: bool = True,
    ) -> nn.SequentialCell:
        """Build one stage of `blocks` blocks; updates self.inplanes."""
        downsample = None
        # Projection shortcut when spatial size or channel count changes.
        # With avg_down, downsampling is done by avg pooling followed by a
        # stride-1 1x1 conv (ResNet-D style); otherwise by a strided 1x1 conv.
        if stride != 1 or self.inplanes != planes * block.expansion:
            down_layers = []
            if self.avg_down:
                if dilation == 1:
                    down_layers.append(nn.AvgPool2d(kernel_size=stride, stride=stride, pad_mode="valid"))
                else:
                    down_layers.append(nn.AvgPool2d(kernel_size=1, stride=1, pad_mode="valid"))

                down_layers.append(nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1,
                                             stride=1, has_bias=False))
            else:
                down_layers.append(
                    nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride,
                              has_bias=False))
            down_layers.append(norm_layer(planes * block.expansion))
            downsample = nn.SequentialCell(down_layers)

        layers = []
        # First block of the stage: its conv dilation is halved relative to
        # the stage dilation (1 for dilation 1/2, 2 for dilation 4).
        if dilation == 1 or dilation == 2:
            layers.append(
                block(
                    self.inplanes,
                    planes,
                    stride,
                    downsample=downsample,
                    radix=self.radix,
                    cardinality=self.cardinality,
                    bottleneck_width=self.bottleneck_width,
                    avd=self.avd,
                    avd_first=self.avd_first,
                    dilation=1,
                    is_first=is_first,
                    norm_layer=norm_layer,
                )
            )
        elif dilation == 4:
            layers.append(
                block(
                    self.inplanes,
                    planes,
                    stride,
                    downsample=downsample,
                    radix=self.radix,
                    cardinality=self.cardinality,
                    bottleneck_width=self.bottleneck_width,
                    avd=self.avd,
                    avd_first=self.avd_first,
                    dilation=2,
                    is_first=is_first,
                    norm_layer=norm_layer,
                )
            )
        else:
            raise ValueError(f"Unsupported model type {dilation}")

        self.inplanes = planes * block.expansion
        # Remaining blocks: stride 1, full stage dilation, identity shortcut.
        for i in range(1, blocks):
            layers.append(
                block(
                    self.inplanes,
                    planes,
                    radix=self.radix,
                    cardinality=self.cardinality,
                    bottleneck_width=self.bottleneck_width,
                    avd=self.avd,
                    avd_first=self.avd_first,
                    dilation=dilation,
                    norm_layer=norm_layer,
                )
            )

        return nn.SequentialCell(layers)

    def forward_features(self, x: Tensor) -> Tensor:
        """Extract convolutional features (stem + four stages)."""
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        return x

    def forward_head(self, x: Tensor) -> Tensor:
        """Global average pool, optional dropout, then linear classifier."""
        x = self.avgpool(x)
        if self.drop:
            x = self.drop(x)
        x = self.fc(x)
        return x

    def construct(self, x: Tensor) -> Tensor:
        x = self.forward_features(x)
        x = self.forward_head(x)
        return x

mindcv.models.resnest.SplitAttn

Bases: Cell

Split-Attention Conv2d

Source code in mindcv\models\resnest.py
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
class SplitAttn(nn.Cell):
    """Split-Attention Conv2d (the SplAtConv2d block of ResNeSt).

    A grouped conv produces ``radix`` feature splits; per-split attention
    weights are computed from the globally pooled sum of the splits and the
    output is the attention-weighted sum of the splits.

    Args:
        in_channels: number of input channels.
        out_channels: number of output channels; falls back to ``in_channels`` when falsy.
        kernel_size: conv kernel size. Default: 3.
        stride: conv stride. Default: 1.
        padding: explicit conv padding; if None, ``kernel_size // 2`` is used. Default: 0.
        dilation: conv dilation. Default: 1.
        group: number of conv groups (cardinality). Default: 1.
        bias: whether the conv has a bias. Default: False.
        radix: number of attention splits. Default: 2.
        rd_ratio: reduction ratio for attention hidden channels. Default: 0.25.
        rd_channels: explicit attention hidden channels, overrides ``rd_ratio``. Default: None.
        rd_divisor: divisor for deriving attention hidden channels. Default: 8.
        act_layer: activation layer class. Default: nn.ReLU.
        norm_layer: normalization layer class; if None, identity is used. Default: None.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int = 3,
        stride: int = 1,
        padding: Optional[int] = 0,  # annotated Optional: None is handled explicitly below
        dilation: int = 1,
        group: int = 1,
        bias: bool = False,
        radix: int = 2,
        rd_ratio: float = 0.25,
        rd_channels: Optional[int] = None,
        rd_divisor: int = 8,
        act_layer: nn.Cell = nn.ReLU,
        norm_layer: Optional[nn.Cell] = None,
    ) -> None:
        super(SplitAttn, self).__init__()
        out_channels = out_channels or in_channels
        self.radix = radix
        # The conv emits all radix splits stacked along the channel axis.
        mid_chs = out_channels * radix

        if rd_channels is None:
            attn_chs = make_divisible(in_channels * radix * rd_ratio, min_value=32, divisor=rd_divisor)
        else:
            attn_chs = rd_channels * radix

        # "same"-style padding fallback when no explicit padding is given.
        padding = kernel_size // 2 if padding is None else padding

        self.conv = nn.Conv2d(in_channels, mid_chs, kernel_size=kernel_size, stride=stride,
                              pad_mode="pad", padding=padding, dilation=dilation,
                              group=group * radix, has_bias=bias)
        self.bn0 = norm_layer(mid_chs) if norm_layer else Identity()
        self.act0 = act_layer()
        self.fc1 = nn.Conv2d(out_channels, attn_chs, 1, group=group, has_bias=True)
        # Use the project's Identity fallback here too, consistent with bn0
        # (the original mixed `Identity()` and `nn.Identity()`).
        self.bn1 = norm_layer(attn_chs) if norm_layer else Identity()
        self.act1 = act_layer()
        self.fc2 = nn.Conv2d(attn_chs, mid_chs, 1, group=group, has_bias=True)
        self.rsoftmax = RadixSoftmax(radix, group)
        self.pool = GlobalAvgPooling(keep_dims=True)

    def construct(self, x: Tensor) -> Tensor:
        x = self.conv(x)
        x = self.bn0(x)
        x = self.act0(x)

        B, RC, H, W = x.shape
        # Sum the radix splits to form the gating input.
        if self.radix > 1:
            x = ops.reshape(x, (B, self.radix, RC // self.radix, H, W))
            x_gap = x.sum(axis=1)
        else:
            x_gap = x
        # Squeeze-and-excite style attention over the pooled features.
        x_gap = self.pool(x_gap)
        x_gap = self.fc1(x_gap)
        x_gap = self.bn1(x_gap)
        x_gap = self.act1(x_gap)
        x_attn = self.fc2(x_gap)

        x_attn = self.rsoftmax(x_attn)
        x_attn = ops.reshape(x_attn, (B, -1, 1, 1))
        # Weighted sum of splits (radix > 1) or simple channel gating.
        if self.radix > 1:
            out = x * ops.reshape(x_attn, (B, self.radix, RC // self.radix, 1, 1))
            out = out.sum(axis=1)
        else:
            out = x * x_attn

        return out

resnet

mindcv.models.resnet

MindSpore implementation of ResNet. Refer to Deep Residual Learning for Image Recognition.

mindcv.models.resnet.BasicBlock

Bases: Cell

define the basic block of resnet

Source code in mindcv\models\resnet.py
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
class BasicBlock(nn.Cell):
    """Basic residual block of ResNet: two 3x3 conv-BN pairs plus a skip connection."""

    expansion: int = 1

    def __init__(
        self,
        in_channels: int,
        channels: int,
        stride: int = 1,
        groups: int = 1,
        base_width: int = 64,
        norm: Optional[nn.Cell] = None,
        down_sample: Optional[nn.Cell] = None,
    ) -> None:
        super().__init__()
        # BasicBlock has no bottleneck, so grouped/widened variants are not supported.
        assert groups == 1, "BasicBlock only supports groups=1"
        assert base_width == 64, "BasicBlock only supports base_width=64"
        norm = nn.BatchNorm2d if norm is None else norm

        # First conv carries the (possible) stride; second is always stride 1.
        self.conv1 = nn.Conv2d(in_channels, channels, kernel_size=3,
                               stride=stride, padding=1, pad_mode="pad")
        self.bn1 = norm(channels)
        self.relu = nn.ReLU()
        self.conv2 = nn.Conv2d(channels, channels, kernel_size=3,
                               stride=1, padding=1, pad_mode="pad")
        self.bn2 = norm(channels)
        # Optional projection for the skip path when shape changes.
        self.down_sample = down_sample

    def construct(self, x: Tensor) -> Tensor:
        # Project the shortcut if the residual shape differs from the input.
        shortcut = x if self.down_sample is None else self.down_sample(x)

        y = self.relu(self.bn1(self.conv1(x)))
        y = self.bn2(self.conv2(y))

        y += shortcut
        return self.relu(y)

mindcv.models.resnet.Bottleneck

Bases: Cell

Bottleneck here places the stride for downsampling at the 3x3 convolution (self.conv2), as torchvision does, while the original implementation places the stride at the first 1x1 convolution (self.conv1).

Source code in mindcv\models\resnet.py
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
class Bottleneck(nn.Cell):
    """
    Bottleneck here places the stride for downsampling at 3x3 convolution(self.conv2) as torchvision does,
    while original implementation places the stride at the first 1x1 convolution(self.conv1)
    """
    expansion: int = 4

    def __init__(
        self,
        in_channels: int,
        channels: int,
        stride: int = 1,
        groups: int = 1,
        base_width: int = 64,
        norm: Optional[nn.Cell] = None,
        down_sample: Optional[nn.Cell] = None,
    ) -> None:
        super().__init__()
        norm = nn.BatchNorm2d if norm is None else norm

        # Hidden width scales with base_width and the number of conv groups.
        width = int(channels * (base_width / 64.0)) * groups

        self.conv1 = nn.Conv2d(in_channels, width, kernel_size=1, stride=1)
        self.bn1 = norm(width)
        # Downsampling stride lives on the 3x3 conv (torchvision placement).
        self.conv2 = nn.Conv2d(width, width, kernel_size=3, stride=stride,
                               padding=1, pad_mode="pad", group=groups)
        self.bn2 = norm(width)
        self.conv3 = nn.Conv2d(width, channels * self.expansion,
                               kernel_size=1, stride=1)
        self.bn3 = norm(channels * self.expansion)
        self.relu = nn.ReLU()
        self.down_sample = down_sample

    def construct(self, x: Tensor) -> Tensor:
        # Residual branch: 1x1 reduce -> 3x3 -> 1x1 expand.
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        # Identity branch, projected when a downsample cell is supplied.
        identity = x if self.down_sample is None else self.down_sample(x)

        out += identity
        return self.relu(out)

mindcv.models.resnet.ResNet

Bases: Cell

ResNet model class, based on "Deep Residual Learning for Image Recognition" <https://arxiv.org/abs/1512.03385>_

PARAMETER DESCRIPTION
block

block of resnet.

TYPE: Type[Union[BasicBlock, Bottleneck]]

layers

number of layers of each stage.

TYPE: List[int]

num_classes

number of classification classes. Default: 1000.

TYPE: int DEFAULT: 1000

in_channels

number of channels of the input. Default: 3.

TYPE: int DEFAULT: 3

groups

number of groups for group conv in blocks. Default: 1.

TYPE: int DEFAULT: 1

base_width

base width of pre group hidden channel in blocks. Default: 64.

TYPE: int DEFAULT: 64

norm

normalization layer in blocks. Default: None.

TYPE: Optional[Cell] DEFAULT: None

Source code in mindcv\models\resnet.py
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
class ResNet(nn.Cell):
    r"""ResNet model class, based on
    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/abs/1512.03385>`_

    Args:
        block: block of resnet.
        layers: number of layers of each stage.
        num_classes: number of classification classes. Default: 1000.
        in_channels: number of channels of the input. Default: 3.
        groups: number of groups for group conv in blocks. Default: 1.
        base_width: base width of pre group hidden channel in blocks. Default: 64.
        norm: normalization layer in blocks. Default: None.
    """

    def __init__(
        self,
        block: Type[Union[BasicBlock, Bottleneck]],
        layers: List[int],
        num_classes: int = 1000,
        in_channels: int = 3,
        groups: int = 1,
        base_width: int = 64,
        norm: Optional[nn.Cell] = None,
    ) -> None:
        super().__init__()
        if norm is None:
            norm = nn.BatchNorm2d

        self.norm: nn.Cell = norm  # add type hints to make pylint happy
        self.input_channels = 64
        self.groups = groups
        # Fixed attribute name (was the typo `base_with`).
        self.base_width = base_width

        # Stem: 7x7/2 conv + bn + relu, then 3x3/2 max-pool (overall stride 4).
        self.conv1 = nn.Conv2d(in_channels, self.input_channels, kernel_size=7,
                               stride=2, pad_mode="pad", padding=3)
        self.bn1 = norm(self.input_channels)
        self.relu = nn.ReLU()
        self.feature_info = [dict(chs=self.input_channels, reduction=2, name="relu")]
        self.max_pool = nn.MaxPool2d(kernel_size=3, stride=2, pad_mode="same")
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.feature_info.append(dict(chs=block.expansion * 64, reduction=4, name="layer1"))
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.feature_info.append(dict(chs=block.expansion * 128, reduction=8, name="layer2"))
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.feature_info.append(dict(chs=block.expansion * 256, reduction=16, name="layer3"))
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        self.feature_info.append(dict(chs=block.expansion * 512, reduction=32, name="layer4"))

        self.pool = GlobalAvgPooling()
        self.num_features = 512 * block.expansion
        self.classifier = nn.Dense(self.num_features, num_classes)

        self._initialize_weights()

    def _initialize_weights(self) -> None:
        """Initialize weights for cells."""
        for _, cell in self.cells_and_names():
            if isinstance(cell, nn.Conv2d):
                cell.weight.set_data(
                    init.initializer(init.HeNormal(mode='fan_out', nonlinearity='relu'),
                                     cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(
                        init.initializer('zeros', cell.bias.shape, cell.bias.dtype))
            elif isinstance(cell, nn.BatchNorm2d):
                cell.gamma.set_data(init.initializer('ones', cell.gamma.shape, cell.gamma.dtype))
                cell.beta.set_data(init.initializer('zeros', cell.beta.shape, cell.beta.dtype))
            elif isinstance(cell, nn.Dense):
                cell.weight.set_data(
                    init.initializer(init.HeUniform(mode='fan_in', nonlinearity='sigmoid'),
                                     cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(init.initializer('zeros', cell.bias.shape, cell.bias.dtype))

    def _make_layer(
        self,
        block: Type[Union[BasicBlock, Bottleneck]],
        channels: int,
        block_nums: int,
        stride: int = 1,
    ) -> nn.SequentialCell:
        """Build one residual stage of `block_nums` blocks; only the first block downsamples."""
        down_sample = None

        # A 1x1 projection is needed whenever the spatial size or channel count changes.
        if stride != 1 or self.input_channels != channels * block.expansion:
            down_sample = nn.SequentialCell([
                nn.Conv2d(self.input_channels, channels * block.expansion, kernel_size=1, stride=stride),
                self.norm(channels * block.expansion)
            ])

        layers = []
        layers.append(
            block(
                self.input_channels,
                channels,
                stride=stride,
                down_sample=down_sample,
                groups=self.groups,
                base_width=self.base_width,
                norm=self.norm,
            )
        )
        self.input_channels = channels * block.expansion

        for _ in range(1, block_nums):
            layers.append(
                block(
                    self.input_channels,
                    channels,
                    groups=self.groups,
                    base_width=self.base_width,
                    norm=self.norm
                )
            )

        return nn.SequentialCell(layers)

    def forward_features(self, x: Tensor) -> Tensor:
        """Network forward feature extraction."""
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.max_pool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        return x

    def forward_head(self, x: Tensor) -> Tensor:
        """Global average pooling followed by the linear classifier."""
        x = self.pool(x)
        x = self.classifier(x)
        return x

    def construct(self, x: Tensor) -> Tensor:
        x = self.forward_features(x)
        x = self.forward_head(x)
        return x
mindcv.models.resnet.ResNet.forward_features(x)

Network forward feature extraction.

Source code in mindcv\models\resnet.py
280
281
282
283
284
285
286
287
288
289
290
291
def forward_features(self, x: Tensor) -> Tensor:
    """Network forward feature extraction (stem followed by the four residual stages)."""
    # Stem: conv -> bn -> relu -> max-pool.
    x = self.max_pool(self.relu(self.bn1(self.conv1(x))))

    # Residual stages, applied in order.
    for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
        x = stage(x)
    return x

mindcv.models.resnet.resnet101(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get 101 layers ResNet model. Refer to the base class models.ResNet for more details.

Source code in mindcv\models\resnet.py
341
342
343
344
345
346
347
348
349
@register_model
def resnet101(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs):
    """Build a 101-layer ResNet; see the base class `models.ResNet` for details."""
    model_args = dict(
        block=Bottleneck,
        layers=[3, 4, 23, 3],
        num_classes=num_classes,
        in_channels=in_channels,
        **kwargs,
    )
    return _create_resnet(pretrained, default_cfg=default_cfgs["resnet101"], **model_args)

mindcv.models.resnet.resnet152(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get 152 layers ResNet model. Refer to the base class models.ResNet for more details.

Source code in mindcv\models\resnet.py
352
353
354
355
356
357
358
359
360
@register_model
def resnet152(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs):
    """Build a 152-layer ResNet; see the base class `models.ResNet` for details."""
    model_args = dict(
        block=Bottleneck,
        layers=[3, 8, 36, 3],
        num_classes=num_classes,
        in_channels=in_channels,
        **kwargs,
    )
    return _create_resnet(pretrained, default_cfg=default_cfgs["resnet152"], **model_args)

mindcv.models.resnet.resnet18(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get 18 layers ResNet model. Refer to the base class models.ResNet for more details.

Source code in mindcv\models\resnet.py
308
309
310
311
312
313
314
315
316
@register_model
def resnet18(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs):
    """Build an 18-layer ResNet; see the base class `models.ResNet` for details."""
    model_args = dict(
        block=BasicBlock,
        layers=[2, 2, 2, 2],
        num_classes=num_classes,
        in_channels=in_channels,
        **kwargs,
    )
    return _create_resnet(pretrained, default_cfg=default_cfgs["resnet18"], **model_args)

mindcv.models.resnet.resnet34(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get 34 layers ResNet model. Refer to the base class models.ResNet for more details.

Source code in mindcv\models\resnet.py
319
320
321
322
323
324
325
326
327
@register_model
def resnet34(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs):
    """Build a 34-layer ResNet; see the base class `models.ResNet` for details."""
    model_args = dict(
        block=BasicBlock,
        layers=[3, 4, 6, 3],
        num_classes=num_classes,
        in_channels=in_channels,
        **kwargs,
    )
    return _create_resnet(pretrained, default_cfg=default_cfgs["resnet34"], **model_args)

mindcv.models.resnet.resnet50(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get 50 layers ResNet model. Refer to the base class models.ResNet for more details.

Source code in mindcv\models\resnet.py
330
331
332
333
334
335
336
337
338
@register_model
def resnet50(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs):
    """Build a 50-layer ResNet; see the base class `models.ResNet` for details."""
    model_args = dict(
        block=Bottleneck,
        layers=[3, 4, 6, 3],
        num_classes=num_classes,
        in_channels=in_channels,
        **kwargs,
    )
    return _create_resnet(pretrained, default_cfg=default_cfgs["resnet50"], **model_args)

mindcv.models.resnet.resnext101_32x4d(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get 101 layers ResNeXt model with 32 groups of GPConv. Refer to the base class models.ResNet for more details.

Source code in mindcv\models\resnet.py
374
375
376
377
378
379
380
381
382
@register_model
def resnext101_32x4d(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs):
    """Build a 101-layer ResNeXt (32 groups, width 4); see `models.ResNet` for details."""
    model_args = dict(
        block=Bottleneck,
        layers=[3, 4, 23, 3],
        groups=32,
        base_width=4,
        num_classes=num_classes,
        in_channels=in_channels,
        **kwargs,
    )
    return _create_resnet(pretrained, default_cfg=default_cfgs["resnext101_32x4d"], **model_args)

mindcv.models.resnet.resnext101_64x4d(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get 101 layers ResNeXt model with 64 groups of GPConv. Refer to the base class models.ResNet for more details.

Source code in mindcv\models\resnet.py
385
386
387
388
389
390
391
392
393
@register_model
def resnext101_64x4d(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs):
    """Build a 101-layer ResNeXt (64 groups, width 4); see `models.ResNet` for details."""
    model_args = dict(
        block=Bottleneck,
        layers=[3, 4, 23, 3],
        groups=64,
        base_width=4,
        num_classes=num_classes,
        in_channels=in_channels,
        **kwargs,
    )
    return _create_resnet(pretrained, default_cfg=default_cfgs["resnext101_64x4d"], **model_args)

mindcv.models.resnet.resnext50_32x4d(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get 50 layers ResNeXt model with 32 groups of GPConv. Refer to the base class models.ResNet for more details.

Source code in mindcv\models\resnet.py
363
364
365
366
367
368
369
370
371
@register_model
def resnext50_32x4d(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs):
    """Build a 50-layer ResNeXt (32 groups, width 4); see `models.ResNet` for details."""
    model_args = dict(
        block=Bottleneck,
        layers=[3, 4, 6, 3],
        groups=32,
        base_width=4,
        num_classes=num_classes,
        in_channels=in_channels,
        **kwargs,
    )
    return _create_resnet(pretrained, default_cfg=default_cfgs["resnext50_32x4d"], **model_args)

resnetv2

mindcv.models.resnetv2

MindSpore implementation of ResNetV2. Refer to Identity Mappings in Deep Residual Networks.

mindcv.models.resnetv2.resnetv2_101(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get 101 layers ResNetV2 model. Refer to the base class models.ResNet for more details.

Source code in mindcv\models\resnetv2.py
108
109
110
111
112
113
114
115
116
117
118
119
@register_model
def resnetv2_101(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs):
    """Build a 101-layer ResNetV2 (pre-activation bottlenecks); see `models.ResNet` for details."""
    model = ResNet(PreActBottleneck, [3, 4, 23, 3], num_classes=num_classes, in_channels=in_channels, **kwargs)

    # Optionally initialize from the published checkpoint.
    if pretrained:
        load_pretrained(model, default_cfgs["resnetv2_101"], num_classes=num_classes, in_channels=in_channels)

    return model

mindcv.models.resnetv2.resnetv2_50(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get 50 layers ResNetV2 model. Refer to the base class models.ResNet for more details.

Source code in mindcv\models\resnetv2.py
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
@register_model
def resnetv2_50(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs):
    """Build a 50-layer ResNetV2 (pre-activation bottlenecks); see `models.ResNet` for details."""
    model = ResNet(PreActBottleneck, [3, 4, 6, 3], num_classes=num_classes, in_channels=in_channels, **kwargs)

    # Optionally initialize from the published checkpoint.
    if pretrained:
        load_pretrained(model, default_cfgs['resnetv2_50'], num_classes=num_classes, in_channels=in_channels)

    return model

rexnet

mindcv.models.rexnet

MindSpore implementation of ReXNet. Refer to ReXNet: Rethinking Channel Dimensions for Efficient Model Design.

mindcv.models.rexnet.LinearBottleneck

Bases: Cell

LinearBottleneck

Source code in mindcv\models\rexnet.py
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
class LinearBottleneck(nn.Cell):
    """ReXNet linear bottleneck: optional 1x1 expand -> 3x3 depthwise (+SE) -> 1x1 project."""

    def __init__(
        self,
        in_channels,
        out_channels,
        exp_ratio,
        stride,
        use_se=True,
        se_ratio=1 / 12,
        ch_div=1,
        act_layer=nn.SiLU,
        dw_act_layer=nn.ReLU6,
        drop_path=None,
        **kwargs,
    ):
        super(LinearBottleneck, self).__init__(**kwargs)
        # Partial residual only when spatial size is kept and channels do not shrink.
        self.use_shortcut = stride == 1 and in_channels <= out_channels
        self.in_channels = in_channels
        self.out_channels = out_channels

        if exp_ratio == 1:
            dw_channels = in_channels
            self.conv_exp = None
        else:
            dw_channels = in_channels * exp_ratio
            self.conv_exp = Conv2dNormActivation(in_channels, dw_channels, 1, activation=act_layer)

        self.conv_dw = Conv2dNormActivation(dw_channels, dw_channels, 3, stride, padding=1,
                                            groups=dw_channels, activation=None)

        if use_se:
            rd_chs = make_divisible(int(dw_channels * se_ratio), ch_div)
            self.se = SqueezeExcite(dw_channels, rd_channels=rd_chs, norm=nn.BatchNorm2d)
        else:
            self.se = None
        self.act_dw = dw_act_layer()

        self.conv_pwl = Conv2dNormActivation(dw_channels, out_channels, 1, padding=0, activation=None)
        self.drop_path = drop_path

    def construct(self, x):
        shortcut = x
        if self.conv_exp is not None:
            x = self.conv_exp(x)
        x = self.conv_dw(x)
        if self.se is not None:
            x = self.se(x)
        x = self.act_dw(x)
        x = self.conv_pwl(x)
        if self.use_shortcut:
            if self.drop_path is not None:
                x = self.drop_path(x)
            # Residual is added only to the first `in_channels` feature maps.
            x[:, 0:self.in_channels] += shortcut
        return x

mindcv.models.rexnet.ReXNetV1

Bases: Cell

ReXNet model class, based on "Rethinking Channel Dimensions for Efficient Model Design" <https://arxiv.org/abs/2007.00992>_

PARAMETER DESCRIPTION
in_channels

number of the input channels. Default: 3.

TYPE: int DEFAULT: 3

fi_channels

number of the final channels. Default: 180.

TYPE: int DEFAULT: 180

initial_channels

initialize inplanes. Default: 16.

TYPE: int DEFAULT: 16

width_mult

The ratio of the channel. Default: 1.0.

TYPE: float DEFAULT: 1.0

depth_mult

The ratio of num_layers. Default: 1.0.

TYPE: float DEFAULT: 1.0

num_classes

number of classification classes. Default: 1000.

TYPE: int) DEFAULT: 1000

use_se

use SENet in LinearBottleneck. Default: True.

TYPE: bool DEFAULT: True

se_ratio

SENet reduction ratio. Default: 1/12.

DEFAULT: 1 / 12

drop_rate

dropout ratio. Default: 0.2.

TYPE: float DEFAULT: 0.2

ch_div

divisible by ch_div. Default: 1.

TYPE: int DEFAULT: 1

act_layer

activation function in ConvNormAct. Default: nn.SiLU.

TYPE: Cell DEFAULT: SiLU

dw_act_layer

activation function after dw_conv. Default: nn.ReLU6.

TYPE: Cell DEFAULT: ReLU6

cls_useconv

use conv in classification. Default: False.

TYPE: bool DEFAULT: False

Source code in mindcv\models\rexnet.py
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
class ReXNetV1(nn.Cell):
    r"""ReXNet model class, based on
    `"Rethinking Channel Dimensions for Efficient Model Design" <https://arxiv.org/abs/2007.00992>`_

    Args:
        in_channels (int): number of the input channels. Default: 3.
        fi_channels (int): number of the final channels. Default: 180.
        initial_channels (int): initialize inplanes. Default: 16.
        width_mult (float): The ratio of the channel. Default: 1.0.
        depth_mult (float): The ratio of num_layers. Default: 1.0.
        num_classes (int): number of classification classes. Default: 1000.
        use_se (bool): use SENet in LinearBottleneck. Default: True.
        se_ratio (float): SENet reduction ratio. Default: 1/12.
        drop_rate (float): dropout ratio. Default: 0.2.
        drop_path_rate (float): stochastic depth rate for blocks. Default: 0.0.
        ch_div (int): divisible by ch_div. Default: 1.
        act_layer (nn.Cell): activation function in ConvNormAct. Default: nn.SiLU.
        dw_act_layer (nn.Cell): activation function after dw_conv. Default: nn.ReLU6.
        cls_useconv (bool): use conv in classification. Default: False.
    """

    def __init__(
        self,
        in_channels=3,
        fi_channels=180,
        initial_channels=16,
        width_mult=1.0,
        depth_mult=1.0,
        num_classes=1000,
        use_se=True,
        se_ratio=1 / 12,
        drop_rate=0.2,
        drop_path_rate=0.0,
        ch_div=1,
        act_layer=nn.SiLU,
        dw_act_layer=nn.ReLU6,
        cls_useconv=False,
    ):
        super(ReXNetV1, self).__init__()

        layers = [1, 2, 2, 3, 3, 5]
        strides = [1, 2, 2, 2, 1, 2]
        use_ses = [False, False, True, True, True, True]

        # Scale stage depths, then expand per-stage settings to per-block lists.
        layers = [ceil(element * depth_mult) for element in layers]
        strides = sum([[element] + [1] * (layers[idx] - 1)
                       for idx, element in enumerate(strides)], [])
        if use_se:
            use_ses = sum([[element] * layers[idx] for idx, element in enumerate(use_ses)], [])
        else:
            use_ses = [False] * sum(layers[:])
        exp_ratios = [1] * layers[0] + [6] * sum(layers[1:])

        self.depth = sum(layers[:]) * 3
        stem_channel = 32 / width_mult if width_mult < 1.0 else 32
        inplanes = initial_channels / width_mult if width_mult < 1.0 else initial_channels

        # Per-block channel schedule: widths grow linearly toward fi_channels.
        # (Removed a dead `features = []` here; it was re-initialized below before use.)
        in_channels_group = []
        out_channels_group = []

        for i in range(self.depth // 3):
            if i == 0:
                in_channels_group.append(int(round(stem_channel * width_mult)))
                out_channels_group.append(int(round(inplanes * width_mult)))
            else:
                in_channels_group.append(int(round(inplanes * width_mult)))
                inplanes += fi_channels / (self.depth // 3 * 1.0)
                out_channels_group.append(int(round(inplanes * width_mult)))

        stem_chs = make_divisible(round(stem_channel * width_mult), divisor=ch_div)
        self.stem = Conv2dNormActivation(in_channels, stem_chs, stride=2, padding=1, activation=act_layer)

        feat_chs = [stem_chs]
        self.feature_info = []
        curr_stride = 2
        features = []
        num_blocks = len(in_channels_group)
        for block_idx, (in_c, out_c, exp_ratio, stride, use_se) in enumerate(
            zip(in_channels_group, out_channels_group, exp_ratios, strides, use_ses)
        ):
            if stride > 1:
                fname = "stem" if block_idx == 0 else f"features.{block_idx - 1}"
                self.feature_info += [dict(chs=feat_chs[-1], reduction=curr_stride, name=fname)]
            # Stochastic depth linear decay rule; max() guards the (degenerate)
            # single-block case against division by zero.
            block_dpr = drop_path_rate * block_idx / max(num_blocks - 1, 1)
            drop_path = DropPath(block_dpr) if block_dpr > 0. else None
            features.append(LinearBottleneck(in_channels=in_c,
                                             out_channels=out_c,
                                             exp_ratio=exp_ratio,
                                             stride=stride,
                                             use_se=use_se,
                                             se_ratio=se_ratio,
                                             act_layer=act_layer,
                                             dw_act_layer=dw_act_layer,
                                             drop_path=drop_path))
            curr_stride *= stride
            feat_chs.append(out_c)

        pen_channels = make_divisible(int(1280 * width_mult), divisor=ch_div)
        self.feature_info += [dict(chs=feat_chs[-1], reduction=curr_stride, name=f'features.{len(features) - 1}')]
        self.flatten_sequential = True
        features.append(Conv2dNormActivation(out_channels_group[-1],
                                             pen_channels,
                                             kernel_size=1,
                                             activation=act_layer))

        features.append(GlobalAvgPooling(keep_dims=True))
        self.useconv = cls_useconv
        self.features = nn.SequentialCell(*features)
        # Classifier head: 1x1 conv or dense, preceded by dropout.
        if self.useconv:
            self.cls = nn.SequentialCell(
                Dropout(p=drop_rate),
                nn.Conv2d(pen_channels, num_classes, 1, has_bias=True))
        else:
            self.cls = nn.SequentialCell(
                Dropout(p=drop_rate),
                nn.Dense(pen_channels, num_classes))
        self._initialize_weights()

    def _initialize_weights(self) -> None:
        """Initialize weights for cells."""
        for _, cell in self.cells_and_names():
            if isinstance(cell, (nn.Conv2d, nn.Dense)):
                cell.weight.set_data(
                    init.initializer(init.HeUniform(math.sqrt(5), mode="fan_in", nonlinearity="relu"),
                                     cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(
                        init.initializer(init.HeUniform(math.sqrt(5), mode="fan_in", nonlinearity="leaky_relu"),
                                         [1, cell.bias.shape[0]], cell.bias.dtype).reshape((-1)))

    def forward_features(self, x):
        """Stem followed by the bottleneck feature stack (incl. final pooling)."""
        x = self.stem(x)
        x = self.features(x)
        return x

    def forward_head(self, x):
        """Flatten-then-classify (dense head) or classify-then-flatten (conv head)."""
        if not self.useconv:
            x = x.reshape((x.shape[0], -1))
            x = self.cls(x)
        else:
            x = self.cls(x).reshape((x.shape[0], -1))
        return x

    def construct(self, x):
        x = self.forward_features(x)
        x = self.forward_head(x)
        return x

mindcv.models.rexnet.rexnet_09(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ReXNet model with width multiplier of 0.9. Refer to the base class models.ReXNetV1 for more details.

Source code in mindcv\models\rexnet.py
269
270
271
272
273
274
@register_model
def rexnet_09(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> ReXNetV1:
    """Build a ReXNet with a 0.9x width multiplier; see `models.ReXNetV1` for details."""
    width_mult = 0.9
    return _rexnet("rexnet_09", width_mult, in_channels, num_classes, pretrained, **kwargs)

mindcv.models.rexnet.rexnet_10(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ReXNet model with width multiplier of 1.0. Refer to the base class models.ReXNetV1 for more details.

Source code in mindcv\models\rexnet.py
277
278
279
280
281
282
@register_model
def rexnet_10(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> ReXNetV1:
    """Build a ReXNet with a 1.0x width multiplier; see `models.ReXNetV1` for details."""
    width_mult = 1.0
    return _rexnet("rexnet_10", width_mult, in_channels, num_classes, pretrained, **kwargs)

mindcv.models.rexnet.rexnet_13(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ReXNet model with width multiplier of 1.3. Refer to the base class models.ReXNetV1 for more details.

Source code in mindcv\models\rexnet.py
285
286
287
288
289
290
@register_model
def rexnet_13(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> ReXNetV1:
    """Build a ReXNet with a 1.3x width multiplier; see `models.ReXNetV1` for details."""
    width_mult = 1.3
    return _rexnet("rexnet_13", width_mult, in_channels, num_classes, pretrained, **kwargs)

mindcv.models.rexnet.rexnet_15(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ReXNet model with width multiplier of 1.5. Refer to the base class models.ReXNetV1 for more details.

Source code in mindcv\models\rexnet.py
293
294
295
296
297
298
@register_model
def rexnet_15(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> ReXNetV1:
    """Build a ReXNet with a 1.5x width multiplier; see `models.ReXNetV1` for details."""
    width_mult = 1.5
    return _rexnet("rexnet_15", width_mult, in_channels, num_classes, pretrained, **kwargs)

mindcv.models.rexnet.rexnet_20(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ReXNet model with width multiplier of 2.0. Refer to the base class models.ReXNetV1 for more details.

Source code in mindcv\models\rexnet.py
301
302
303
304
305
306
@register_model
def rexnet_20(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> ReXNetV1:
    """Build a ReXNet with a 2.0x width multiplier; see `models.ReXNetV1` for details."""
    width_mult = 2.0
    return _rexnet("rexnet_20", width_mult, in_channels, num_classes, pretrained, **kwargs)

senet

mindcv.models.senet

MindSpore implementation of SENet. Refer to Squeeze-and-Excitation Networks.

mindcv.models.senet.Bottleneck

Bases: Cell

Define the base block class for [SEnet, SEResNet, SEResNext] bottlenecks that implements construct method.

Source code in mindcv\models\senet.py
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
class Bottleneck(nn.Cell):
    """
    Shared `construct` for the [SEnet, SEResNet, SEResNext] bottleneck variants;
    subclasses are expected to define the conv/bn/se submodules used here.
    """

    def construct(self, x: Tensor) -> Tensor:
        # Residual branch: three conv-bn stages, ReLU after the first two.
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        # Identity branch, projected when a downsample cell is present.
        shortcut = x if self.downsample is None else self.downsample(x)

        # Channel re-weighting via the SE module, then the residual add.
        out = self.se_module(out) + shortcut
        return self.relu(out)

mindcv.models.senet.SEBottleneck

Bases: Bottleneck

Define the Bottleneck for SENet154.

Source code in mindcv\models\senet.py
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
class SEBottleneck(Bottleneck):
    """
    Bottleneck variant used by SENet154.
    """

    expansion: int = 4

    def __init__(
        self,
        in_channels: int,
        channels: int,
        group: int,
        reduction: int,
        stride: int = 1,
        downsample: Optional[nn.SequentialCell] = None,
    ) -> None:
        super(SEBottleneck, self).__init__()
        mid_channels = channels * 2
        out_channels = channels * 4
        # 1x1 conv to 2*channels, grouped 3x3 conv to 4*channels, 1x1 conv keeping 4*channels.
        self.conv1 = nn.Conv2d(in_channels, mid_channels, kernel_size=1, pad_mode="pad",
                               padding=0, has_bias=False)
        self.bn1 = nn.BatchNorm2d(mid_channels)
        self.conv2 = nn.Conv2d(mid_channels, out_channels, kernel_size=3, stride=stride,
                               pad_mode="pad", padding=1, group=group, has_bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.conv3 = nn.Conv2d(out_channels, out_channels, kernel_size=1, pad_mode="pad",
                               padding=0, has_bias=False)
        self.bn3 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU()
        self.se_module = SqueezeExciteV2(out_channels, rd_ratio=1.0 / reduction)
        self.downsample = downsample
        self.stride = stride

mindcv.models.senet.SENet

Bases: Cell

SENet model class, based on "Squeeze-and-Excitation Networks" <https://arxiv.org/abs/1709.01507>_

PARAMETER DESCRIPTION
block

block class of SENet.

TYPE: Type[Union[SEBottleneck, SEResNetBottleneck, SEResNetBlock, SEResNeXtBottleneck]]

layers

Number of residual blocks for 4 layers.

TYPE: List[int]

group

Number of groups for the conv in each bottleneck block.

TYPE: int

reduction

Reduction ratio for Squeeze-and-Excitation modules.

TYPE: int

drop_rate

Drop probability for the Dropout layer. Default: 0.

TYPE: float DEFAULT: 0.0

in_channels

number of channels of the input. Default: 3.

TYPE: int DEFAULT: 3

inplanes

Number of input channels for layer1. Default: 64.

TYPE: int DEFAULT: 64

input3x3

If True, use three 3x3 convolutions in layer0. Default: False.

TYPE: bool DEFAULT: False

downsample_kernel_size

Kernel size for downsampling convolutions. Default: 1.

TYPE: int DEFAULT: 1

downsample_padding

Padding for downsampling convolutions. Default: 0.

TYPE: int DEFAULT: 0

num_classes

number of classification classes. Default: 1000.

TYPE: int DEFAULT: 1000

Source code in mindcv\models\senet.py
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
class SENet(nn.Cell):
    r"""SENet model class, based on
    `"Squeeze-and-Excitation Networks" <https://arxiv.org/abs/1709.01507>`_

    Args:
        block: block class of SENet.
        layers: Number of residual blocks for 4 layers.
        group: Number of groups for the conv in each bottleneck block.
        reduction: Reduction ratio for Squeeze-and-Excitation modules.
        drop_rate: Drop probability for the Dropout layer. Default: 0.
        in_channels: number of channels of the input. Default: 3.
        inplanes: Number of input channels for layer1. Default: 64.
        input3x3: If `True`, use three 3x3 convolutions in layer0. Default: False.
        downsample_kernel_size: Kernel size for downsampling convolutions. Default: 1.
        downsample_padding: Padding for downsampling convolutions. Default: 0.
        num_classes (int): number of classification classes. Default: 1000.
    """

    def __init__(
        self,
        block: Type[Union[SEBottleneck, SEResNetBottleneck, SEResNetBlock, SEResNeXtBottleneck]],
        layers: List[int],
        group: int,
        reduction: int,
        drop_rate: float = 0.0,
        in_channels: int = 3,
        inplanes: int = 64,
        input3x3: bool = False,
        downsample_kernel_size: int = 1,
        downsample_padding: int = 0,
        num_classes: int = 1000,
    ) -> None:
        super(SENet, self).__init__()
        self.inplanes = inplanes
        self.num_classes = num_classes
        self.drop_rate = drop_rate
        if input3x3:
            # SENet-154-style stem: three stacked 3x3 convs (first one strided).
            self.layer0 = nn.SequentialCell([
                nn.Conv2d(in_channels, 64, 3, stride=2, pad_mode="pad", padding=1, has_bias=False),
                nn.BatchNorm2d(64),
                nn.ReLU(),
                nn.Conv2d(64, 64, 3, stride=1, pad_mode="pad", padding=1, has_bias=False),
                nn.BatchNorm2d(64),
                nn.ReLU(),
                nn.Conv2d(64, inplanes, 3, stride=1, pad_mode="pad", padding=1, has_bias=False),
                nn.BatchNorm2d(inplanes),
                nn.ReLU()
            ])
        else:
            # ResNet-style stem: a single strided 7x7 conv.
            self.layer0 = nn.SequentialCell([
                nn.Conv2d(in_channels, inplanes, kernel_size=7, stride=2, pad_mode="pad",
                          padding=3, has_bias=False),
                nn.BatchNorm2d(inplanes),
                nn.ReLU()
            ])
        self.pool0 = nn.MaxPool2d(3, stride=2, pad_mode="same")

        # layer1 always uses a 1x1/pad-0 downsample regardless of the
        # configured downsample_kernel_size/padding (matches the reference model).
        self.layer1 = self._make_layer(block, planes=64, blocks=layers[0], group=group,
                                       reduction=reduction, downsample_kernel_size=1,
                                       downsample_padding=0)

        self.layer2 = self._make_layer(block, planes=128, blocks=layers[1], stride=2,
                                       group=group, reduction=reduction,
                                       downsample_kernel_size=downsample_kernel_size,
                                       downsample_padding=downsample_padding)

        self.layer3 = self._make_layer(block, planes=256, blocks=layers[2], stride=2,
                                       group=group, reduction=reduction,
                                       downsample_kernel_size=downsample_kernel_size,
                                       downsample_padding=downsample_padding)

        self.layer4 = self._make_layer(block, planes=512, blocks=layers[3], stride=2,
                                       group=group, reduction=reduction,
                                       downsample_kernel_size=downsample_kernel_size,
                                       downsample_padding=downsample_padding)

        # Final feature width depends on the block's channel expansion factor.
        self.num_features = 512 * block.expansion

        self.pool = GlobalAvgPooling()
        # Dropout is only created when requested; forward_head checks the same flag.
        if self.drop_rate > 0.:
            self.dropout = Dropout(p=self.drop_rate)
        self.classifier = nn.Dense(self.num_features, self.num_classes)

        self._initialize_weights()

    def _make_layer(
        self,
        block: Type[Union[SEBottleneck, SEResNetBottleneck, SEResNetBlock, SEResNeXtBottleneck]],
        planes: int,
        blocks: int,
        group: int,
        reduction: int,
        stride: int = 1,
        downsample_kernel_size: int = 1,
        downsample_padding: int = 0,
    ) -> nn.SequentialCell:
        """Build one residual stage of `blocks` blocks; only the first block strides/downsamples."""
        downsample = None
        # A projection shortcut is needed when the spatial size or channel count changes.
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.SequentialCell([
                nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=downsample_kernel_size,
                          stride=stride, pad_mode="pad", padding=downsample_padding, has_bias=False),
                nn.BatchNorm2d(planes * block.expansion)
            ])

        layers = [block(self.inplanes, planes, group, reduction, stride, downsample)]
        # NOTE: self.inplanes is mutated here; stages must be built in order.
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes, group, reduction))

        return nn.SequentialCell(layers)

    def _initialize_weights(self) -> None:
        """Initialize weights for cells."""
        for _, cell in self.cells_and_names():
            if isinstance(cell, nn.Conv2d):
                # He-normal (fan-out) init for convolutions, zero bias.
                cell.weight.set_data(
                    init.initializer(init.HeNormal(mode="fan_out", nonlinearity="relu"),
                                     cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(
                        init.initializer("zeros", cell.bias.shape, cell.bias.dtype))
            elif isinstance(cell, nn.BatchNorm2d):
                cell.gamma.set_data(init.initializer("ones", cell.gamma.shape, cell.gamma.dtype))
                cell.beta.set_data(init.initializer("zeros", cell.beta.shape, cell.beta.dtype))
            elif isinstance(cell, nn.Dense):
                cell.weight.set_data(
                    init.initializer(init.HeUniform(mode="fan_in", nonlinearity="sigmoid"),
                                     cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(init.initializer("zeros", cell.bias.shape, cell.bias.dtype))

    def forward_features(self, x: Tensor) -> Tensor:
        """Run the stem and the four residual stages; returns the final feature map."""
        x = self.layer0(x)
        x = self.pool0(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        return x

    def forward_head(self, x: Tensor) -> Tensor:
        """Global-pool the features, optionally apply dropout, and classify."""
        x = self.pool(x)
        if self.drop_rate > 0.0:
            x = self.dropout(x)
        x = self.classifier(x)
        return x

    def construct(self, x: Tensor) -> Tensor:
        x = self.forward_features(x)
        x = self.forward_head(x)
        return x

mindcv.models.senet.SEResNeXtBottleneck

Bases: Bottleneck

Define the ResNeXt bottleneck type C with a Squeeze-and-Excitation module.

Source code in mindcv\models\senet.py
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
class SEResNeXtBottleneck(Bottleneck):
    """
    ResNeXt type-C bottleneck augmented with a Squeeze-and-Excitation module.
    """

    expansion: int = 4

    def __init__(
        self,
        in_channels: int,
        channels: int,
        group: int,
        reduction: int,
        stride: int = 1,
        downsample: Optional[nn.SequentialCell] = None,
        base_width: int = 4,
    ) -> None:
        super(SEResNeXtBottleneck, self).__init__()
        # Internal width scales with base_width and the number of groups.
        width = math.floor(channels * (base_width / 64)) * group
        out_channels = channels * 4
        self.conv1 = nn.Conv2d(in_channels, width, kernel_size=1, stride=1, pad_mode="pad",
                               padding=0, has_bias=False)
        self.bn1 = nn.BatchNorm2d(width)
        # Grouped 3x3 conv carries the stride.
        self.conv2 = nn.Conv2d(width, width, kernel_size=3, stride=stride, pad_mode="pad",
                               padding=1, group=group, has_bias=False)
        self.bn2 = nn.BatchNorm2d(width)
        self.conv3 = nn.Conv2d(width, out_channels, kernel_size=1, pad_mode="pad", padding=0,
                               has_bias=False)
        self.bn3 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU()
        self.se_module = SqueezeExciteV2(out_channels, rd_ratio=1.0 / reduction)
        self.downsample = downsample
        self.stride = stride

mindcv.models.senet.SEResNetBlock

Bases: Cell

Define the basic block of resnet with a Squeeze-and-Excitation module.

Source code in mindcv\models\senet.py
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
class SEResNetBlock(nn.Cell):
    """
    Basic two-conv residual block of ResNet augmented with a Squeeze-and-Excitation module.
    """

    expansion = 1

    def __init__(
        self,
        in_channels: int,
        channels: int,
        group: int,
        reduction: int,
        stride: int = 1,
        downsample: Optional[nn.SequentialCell] = None,
    ) -> None:
        super(SEResNetBlock, self).__init__()
        # Two 3x3 convolutions; only the first one may stride.
        self.conv1 = nn.Conv2d(in_channels, channels, kernel_size=3, stride=stride, pad_mode="pad",
                               padding=1, has_bias=False)
        self.bn1 = nn.BatchNorm2d(channels)
        self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, pad_mode="pad", padding=1,
                               group=group, has_bias=False)
        self.bn2 = nn.BatchNorm2d(channels)
        self.relu = nn.ReLU()
        self.se_module = SqueezeExciteV2(channels, rd_ratio=1.0 / reduction)
        self.downsample = downsample
        self.stride = stride

    def construct(self, x: Tensor) -> Tensor:
        # conv->bn->relu then conv->bn (ReLU deferred until after the residual add).
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        # Project the identity branch when dims change.
        if self.downsample is not None:
            residual = self.downsample(x)
        else:
            residual = x

        # SE re-weighting, residual add, final activation.
        out = self.se_module(out) + residual
        return self.relu(out)

mindcv.models.senet.SEResNetBottleneck

Bases: Bottleneck

Define the ResNet bottleneck with a Squeeze-and-Excitation module; this bottleneck layout follows the torchvision implementation of ResNet.

Source code in mindcv\models\senet.py
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
class SEResNetBottleneck(Bottleneck):
    """
    ResNet bottleneck augmented with a Squeeze-and-Excitation module; the
    bottleneck layout follows the torchvision ResNet implementation.
    """

    expansion: int = 4

    def __init__(
        self,
        in_channels: int,
        channels: int,
        group: int,
        reduction: int,
        stride: int = 1,
        downsample: Optional[nn.SequentialCell] = None,
    ) -> None:
        super(SEResNetBottleneck, self).__init__()
        out_channels = channels * 4
        # 1x1 reduce, grouped 3x3 (carries the stride), 1x1 expand to 4*channels.
        self.conv1 = nn.Conv2d(in_channels, channels, kernel_size=1, pad_mode="pad",
                               padding=0, has_bias=False)
        self.bn1 = nn.BatchNorm2d(channels)
        self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, stride=stride, pad_mode="pad",
                               padding=1, group=group, has_bias=False)
        self.bn2 = nn.BatchNorm2d(channels)
        self.conv3 = nn.Conv2d(channels, out_channels, kernel_size=1, pad_mode="pad", padding=0,
                               has_bias=False)
        self.bn3 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU()
        self.se_module = SqueezeExciteV2(out_channels, rd_ratio=1.0 / reduction)
        self.downsample = downsample
        self.stride = stride

shufflenetv1

mindcv.models.shufflenetv1

MindSpore implementation of ShuffleNetV1. Refer to ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices

mindcv.models.shufflenetv1.GroupConv

Bases: Cell

Group convolution operation. Because MindSpore doesn't support group convolution in ShuffleNet, the grouped convolution is defined manually instead of using the original nn.Conv2d with its group parameter changed.

PARAMETER DESCRIPTION
in_channels

Input channels of feature map.

TYPE: int

out_channels

Output channels of feature map.

TYPE: int

kernel_size

Size of convolution kernel.

TYPE: int

stride

Stride size for the group convolution layer.

TYPE: int

pad_mode

Specifies padding mode.

TYPE: str DEFAULT: 'pad'

pad

The number of padding on the height and width directions of the input.

TYPE: int DEFAULT: 0

groups

Splits filter into groups, in_channels and out_channels must be divisible by group.

TYPE: int DEFAULT: 1

has_bias

Whether the Conv2d layer has a bias parameter.

TYPE: bool DEFAULT: False

Source code in mindcv\models\shufflenetv1.py
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
class GroupConv(nn.Cell):
    """
    Group convolution operation.
    Because MindSpore doesn't support group convolution in ShuffleNet, the grouped
    convolution is built manually from `groups` independent `nn.Conv2d` cells
    instead of using a single `nn.Conv2d` with its `group` parameter changed.

    Args:
        in_channels (int): Input channels of feature map.
        out_channels (int): Output channels of feature map.
        kernel_size (int): Size of convolution kernel.
        stride (int): Stride size for the group convolution layer.
        pad_mode (str): Specifies padding mode.
        pad (int): The number of padding on the height and width directions of the input.
        groups (int): Splits filter into groups, `in_channels` and `out_channels` must be divisible by `group`.
        has_bias (bool): Whether the Conv2d layer has a bias parameter.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, pad_mode="pad", pad=0, groups=1, has_bias=False):
        super(GroupConv, self).__init__()
        assert in_channels % groups == 0 and out_channels % groups == 0
        self.groups = groups
        # Channel-wise split of the input; each slice feeds its own convolution.
        self.split = Split(split_size_or_sections=in_channels // groups, output_num=self.groups, axis=1)
        self.convs = nn.CellList()
        for _ in range(groups):
            self.convs.append(
                nn.Conv2d(
                    in_channels // groups,
                    out_channels // groups,
                    kernel_size=kernel_size,
                    stride=stride,
                    has_bias=has_bias,
                    padding=pad,
                    pad_mode=pad_mode,
                )
            )

    def construct(self, x):
        slices = self.split(x)
        outs = ()
        for idx in range(self.groups):
            outs += (self.convs[idx](slices[idx]),)
        # Re-join the per-group results along the channel axis.
        return ops.concat(outs, axis=1)

mindcv.models.shufflenetv1.ShuffleNetV1

Bases: Cell

ShuffleNetV1 model class, based on "ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices" <https://arxiv.org/abs/1707.01083>_ # noqa: E501

PARAMETER DESCRIPTION
num_classes

number of classification classes. Default: 1000.

TYPE: int DEFAULT: 1000

in_channels

number of input channels. Default: 3.

TYPE: int DEFAULT: 3

model_size

scale factor which controls the number of channels. Default: '2.0x'.

TYPE: str DEFAULT: '2.0x'

group

number of group for group convolution. Default: 3.

TYPE: int DEFAULT: 3

Source code in mindcv\models\shufflenetv1.py
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
class ShuffleNetV1(nn.Cell):
    r"""ShuffleNetV1 model class, based on
    `"ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices" <https://arxiv.org/abs/1707.01083>`_  # noqa: E501

    Args:
        num_classes: number of classification classes. Default: 1000.
        in_channels: number of input channels. Default: 3.
        model_size: scale factor which controls the number of channels. Default: '2.0x'.
        group: number of group for group convolution. Default: 3.

    Raises:
        ValueError: If `group` is not 3 or 8.
        NotImplementedError: If `model_size` is not one of '0.5x', '1.0x', '1.5x', '2.0x'.
    """

    def __init__(
        self,
        num_classes: int = 1000,
        in_channels: int = 3,
        model_size: str = "2.0x",
        group: int = 3,
    ):
        super().__init__()
        self.stage_repeats = [4, 8, 4]
        self.model_size = model_size
        # Channel configuration per (group, model_size); index 0 is an unused placeholder
        # so the list aligns with 1-based stage indices.
        if group == 3:
            if model_size == "0.5x":
                self.stage_out_channels = [-1, 12, 120, 240, 480]
            elif model_size == "1.0x":
                self.stage_out_channels = [-1, 24, 240, 480, 960]
            elif model_size == "1.5x":
                self.stage_out_channels = [-1, 24, 360, 720, 1440]
            elif model_size == "2.0x":
                self.stage_out_channels = [-1, 48, 480, 960, 1920]
            else:
                raise NotImplementedError
        elif group == 8:
            if model_size == "0.5x":
                self.stage_out_channels = [-1, 16, 192, 384, 768]
            elif model_size == "1.0x":
                self.stage_out_channels = [-1, 24, 384, 768, 1536]
            elif model_size == "1.5x":
                self.stage_out_channels = [-1, 24, 576, 1152, 2304]
            elif model_size == "2.0x":
                self.stage_out_channels = [-1, 48, 768, 1536, 3072]
            else:
                raise NotImplementedError
        else:
            # Previously an unsupported `group` silently left `stage_out_channels`
            # unset and failed later with a confusing AttributeError.
            raise ValueError(f"Unsupported group number: {group}. Only 3 and 8 are supported.")

        # building first layer
        input_channel = self.stage_out_channels[1]
        self.first_conv = nn.SequentialCell(
            nn.Conv2d(in_channels, input_channel, kernel_size=3, stride=2, pad_mode="pad", padding=1),
            nn.BatchNorm2d(input_channel),
            nn.ReLU(),
        )
        self.max_pool = nn.MaxPool2d(kernel_size=3, stride=2, pad_mode="same")

        # Build the three shuffle stages; the first block of each stage strides,
        # and only the very first block of the network skips the grouped 1x1 conv.
        features = []
        for idxstage, numrepeat in enumerate(self.stage_repeats):
            output_channel = self.stage_out_channels[idxstage + 2]
            for i in range(numrepeat):
                stride = 2 if i == 0 else 1
                first_group = idxstage == 0 and i == 0
                features.append(ShuffleV1Block(input_channel, output_channel,
                                               group=group, first_group=first_group,
                                               mid_channels=output_channel // 4, stride=stride))
                input_channel = output_channel

        self.features = nn.SequentialCell(features)
        self.global_pool = GlobalAvgPooling()
        self.classifier = nn.Dense(self.stage_out_channels[-1], num_classes, has_bias=False)
        self._initialize_weights()

    def _initialize_weights(self):
        """Initialize weights for cells."""
        for name, cell in self.cells_and_names():
            if isinstance(cell, nn.Conv2d):
                # The stem conv gets a small fixed std; other convs scale with fan-in.
                if "first" in name:
                    cell.weight.set_data(
                        init.initializer(init.Normal(0.01, 0), cell.weight.shape, cell.weight.dtype))
                else:
                    cell.weight.set_data(
                        init.initializer(init.Normal(1.0 / cell.weight.shape[1], 0), cell.weight.shape,
                                         cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(
                        init.initializer("zeros", cell.bias.shape, cell.bias.dtype))
            elif isinstance(cell, nn.Dense):
                cell.weight.set_data(
                    init.initializer(init.Normal(0.01, 0), cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(
                        init.initializer("zeros", cell.bias.shape, cell.bias.dtype))

    def forward_features(self, x: Tensor) -> Tensor:
        """Run the stem, max-pool and shuffle stages; returns the final feature map."""
        x = self.first_conv(x)
        x = self.max_pool(x)
        x = self.features(x)
        return x

    def forward_head(self, x: Tensor) -> Tensor:
        """Global-pool the features and classify."""
        x = self.global_pool(x)
        x = self.classifier(x)
        return x

    def construct(self, x: Tensor) -> Tensor:
        x = self.forward_features(x)
        x = self.forward_head(x)
        return x

mindcv.models.shufflenetv1.ShuffleV1Block

Bases: Cell

Basic block of ShuffleNetV1: 1x1 group conv -> channel shuffle -> 3x3 depthwise conv -> 1x1 group conv.

Source code in mindcv\models\shufflenetv1.py
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
class ShuffleV1Block(nn.Cell):
    """ShuffleNetV1 unit: 1x1 group conv -> channel shuffle -> 3x3 depthwise conv -> 1x1 group conv."""

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        mid_channels: int,
        stride: int,
        group: int,
        first_group: bool,
    ) -> None:
        super().__init__()
        assert stride in [1, 2]
        self.stride = stride
        self.group = group

        if stride == 2:
            # The stride-2 unit concatenates the pooled input with the main
            # branch, so the main branch only produces the remaining channels.
            out_channels = out_channels - in_channels

        # Pointwise group conv (ungrouped for the very first unit) + BN + ReLU.
        self.branch_main_1 = nn.SequentialCell([
            GroupConv(
                in_channels=in_channels,
                out_channels=mid_channels,
                kernel_size=1,
                stride=1,
                pad_mode="pad",
                pad=0,
                groups=1 if first_group else group,
            ),
            nn.BatchNorm2d(mid_channels),
            nn.ReLU(),
        ])

        # Depthwise 3x3 conv + BN, then linear pointwise group conv + BN.
        self.branch_main_2 = nn.SequentialCell([
            nn.Conv2d(mid_channels, mid_channels, kernel_size=3, stride=stride, pad_mode="pad", padding=1,
                      group=mid_channels),
            nn.BatchNorm2d(mid_channels),
            GroupConv(
                in_channels=mid_channels,
                out_channels=out_channels,
                kernel_size=1,
                stride=1,
                pad_mode="pad",
                pad=0,
                groups=group,
            ),
            nn.BatchNorm2d(out_channels),
        ])

        if stride == 2:
            # Shortcut branch down-samples the identity to match spatial dims.
            self.branch_proj = nn.AvgPool2d(kernel_size=3, stride=2, pad_mode="same")

        self.relu = nn.ReLU()

    def construct(self, x: Tensor) -> Tensor:
        residual = x
        out = self.branch_main_1(x)
        if self.group > 1:
            out = self.channel_shuffle(out)
        out = self.branch_main_2(out)
        if self.stride == 1:
            # Stride-1 units use an additive residual connection.
            return self.relu(residual + out)
        # Stride-2 units concatenate the pooled identity with the main branch.
        return self.relu(ops.concat((self.branch_proj(residual), out), axis=1))

    def channel_shuffle(self, x: Tensor) -> Tensor:
        batch_size, num_channels, height, width = x.shape
        channels_per_group = num_channels // self.group
        # Interleave channels across groups via reshape -> transpose -> reshape.
        out = ops.reshape(x, (batch_size, channels_per_group, self.group, height, width))
        out = ops.transpose(out, (0, 2, 1, 3, 4))
        return ops.reshape(out, (batch_size, num_channels, height, width))

mindcv.models.shufflenetv1.shufflenet_v1_g3_05(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ShuffleNetV1 model with width scaled by 0.5 and 3 groups of GPConv. Refer to the base class models.ShuffleNetV1 for more details.

Source code in mindcv\models\shufflenetv1.py
287
288
289
290
291
292
293
294
295
296
297
298
@register_model
def shufflenet_v1_g3_05(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> ShuffleNetV1:
    """Build a ShuffleNetV1 with width scaled by 0.5 and 3 groups of GPConv.
    See the base class `models.ShuffleNetV1` for details.
    """
    model = ShuffleNetV1(group=3, model_size="0.5x", num_classes=num_classes, in_channels=in_channels, **kwargs)
    if pretrained:
        load_pretrained(model, default_cfgs["shufflenet_v1_g3_05"], num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.shufflenetv1.shufflenet_v1_g3_10(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ShuffleNetV1 model with width scaled by 1.0 and 3 groups of GPConv. Refer to the base class models.ShuffleNetV1 for more details.

Source code in mindcv\models\shufflenetv1.py
301
302
303
304
305
306
307
308
309
310
311
312
@register_model
def shufflenet_v1_g3_10(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> ShuffleNetV1:
    """Build a ShuffleNetV1 with width scaled by 1.0 and 3 groups of GPConv.
    See the base class `models.ShuffleNetV1` for details.
    """
    model = ShuffleNetV1(group=3, model_size="1.0x", num_classes=num_classes, in_channels=in_channels, **kwargs)
    if pretrained:
        load_pretrained(model, default_cfgs["shufflenet_v1_g3_10"], num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.shufflenetv1.shufflenet_v1_g3_15(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ShuffleNetV1 model with width scaled by 1.5 and 3 groups of GPConv. Refer to the base class models.ShuffleNetV1 for more details.

Source code in mindcv\models\shufflenetv1.py
315
316
317
318
319
320
321
322
323
324
325
326
@register_model
def shufflenet_v1_g3_15(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> ShuffleNetV1:
    """Build a ShuffleNetV1 with width scaled by 1.5 and 3 groups of GPConv.
    See the base class `models.ShuffleNetV1` for details.
    """
    model = ShuffleNetV1(group=3, model_size="1.5x", num_classes=num_classes, in_channels=in_channels, **kwargs)
    if pretrained:
        load_pretrained(model, default_cfgs["shufflenet_v1_g3_15"], num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.shufflenetv1.shufflenet_v1_g3_20(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ShuffleNetV1 model with width scaled by 2.0 and 3 groups of GPConv. Refer to the base class models.ShuffleNetV1 for more details.

Source code in mindcv\models\shufflenetv1.py
329
330
331
332
333
334
335
336
337
338
339
340
@register_model
def shufflenet_v1_g3_20(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> ShuffleNetV1:
    """Build a ShuffleNetV1 with width scaled by 2.0 and 3 groups of GPConv.
    See the base class `models.ShuffleNetV1` for details.
    """
    model = ShuffleNetV1(group=3, model_size="2.0x", num_classes=num_classes, in_channels=in_channels, **kwargs)
    if pretrained:
        load_pretrained(model, default_cfgs["shufflenet_v1_g3_20"], num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.shufflenetv1.shufflenet_v1_g8_05(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ShuffleNetV1 model with width scaled by 0.5 and 8 groups of GPConv. Refer to the base class models.ShuffleNetV1 for more details.

Source code in mindcv\models\shufflenetv1.py
343
344
345
346
347
348
349
350
351
352
353
354
@register_model
def shufflenet_v1_g8_05(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> ShuffleNetV1:
    """Build a ShuffleNetV1 with width scaled by 0.5 and 8 groups of GPConv.
    See the base class `models.ShuffleNetV1` for details.
    """
    model = ShuffleNetV1(group=8, model_size="0.5x", num_classes=num_classes, in_channels=in_channels, **kwargs)
    if pretrained:
        load_pretrained(model, default_cfgs["shufflenet_v1_g8_05"], num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.shufflenetv1.shufflenet_v1_g8_10(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ShuffleNetV1 model with width scaled by 1.0 and 8 groups of GPConv. Refer to the base class models.ShuffleNetV1 for more details.

Source code in mindcv\models\shufflenetv1.py
357
358
359
360
361
362
363
364
365
366
367
368
@register_model
def shufflenet_v1_g8_10(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> ShuffleNetV1:
    """Build a ShuffleNetV1 with width scaled by 1.0 and 8 groups of GPConv.
    See the base class `models.ShuffleNetV1` for details.
    """
    model = ShuffleNetV1(group=8, model_size="1.0x", num_classes=num_classes, in_channels=in_channels, **kwargs)
    if pretrained:
        load_pretrained(model, default_cfgs["shufflenet_v1_g8_10"], num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.shufflenetv1.shufflenet_v1_g8_15(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ShuffleNetV1 model with width scaled by 1.5 and 8 groups of GPConv. Refer to the base class models.ShuffleNetV1 for more details.

Source code in mindcv\models\shufflenetv1.py
371
372
373
374
375
376
377
378
379
380
381
382
@register_model
def shufflenet_v1_g8_15(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> ShuffleNetV1:
    """Build a ShuffleNetV1 with 8-group pointwise convolutions at 1.5x width.
    See the base class `models.ShuffleNetV1` for details.
    """
    model = ShuffleNetV1(group=8, model_size="1.5x", num_classes=num_classes, in_channels=in_channels, **kwargs)
    if pretrained:
        # Load the released checkpoint registered under this model name.
        load_pretrained(model, default_cfgs["shufflenet_v1_g8_15"], num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.shufflenetv1.shufflenet_v1_g8_20(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ShuffleNetV1 model with width scaled by 2.0 and 8 groups of GPConv. Refer to the base class models.ShuffleNetV1 for more details.

Source code in mindcv\models\shufflenetv1.py
385
386
387
388
389
390
391
392
393
394
395
396
@register_model
def shufflenet_v1_g8_20(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> ShuffleNetV1:
    """Build a ShuffleNetV1 with 8-group pointwise convolutions at 2.0x width.
    See the base class `models.ShuffleNetV1` for details.
    """
    model = ShuffleNetV1(group=8, model_size="2.0x", num_classes=num_classes, in_channels=in_channels, **kwargs)
    if pretrained:
        # Load the released checkpoint registered under this model name.
        load_pretrained(model, default_cfgs["shufflenet_v1_g8_20"], num_classes=num_classes, in_channels=in_channels)
    return model

shufflenetv2

mindcv.models.shufflenetv2

MindSpore implementation of ShuffleNetV2. Refer to ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design

mindcv.models.shufflenetv2.ShuffleNetV2

Bases: Cell

ShuffleNetV2 model class, based on "ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design" <https://arxiv.org/abs/1807.11164>_

PARAMETER DESCRIPTION
num_classes

number of classification classes. Default: 1000.

TYPE: int DEFAULT: 1000

in_channels

number of input channels. Default: 3.

TYPE: int DEFAULT: 3

model_size

scale factor which controls the number of channels. Default: '1.5x'.

TYPE: str DEFAULT: '1.5x'

Source code in mindcv\models\shufflenetv2.py
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
class ShuffleNetV2(nn.Cell):
    r"""ShuffleNetV2 model class, based on
    `"ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design" <https://arxiv.org/abs/1807.11164>`_

    Args:
        num_classes: number of classification classes. Default: 1000.
        in_channels: number of input channels. Default: 3.
        model_size: scale factor which controls the number of channels. Default: '1.5x'.
    """

    def __init__(
        self,
        num_classes: int = 1000,
        in_channels: int = 3,
        model_size: str = "1.5x",
    ):
        super().__init__()

        self.stage_repeats = [4, 8, 4]
        self.model_size = model_size
        # Per-variant channel table. Slot 0 is a placeholder so that stage i
        # reads its output width at index i + 2 (index 1 is the stem width).
        channel_table = {
            "0.5x": [-1, 24, 48, 96, 192, 1024],
            "1.0x": [-1, 24, 116, 232, 464, 1024],
            "1.5x": [-1, 24, 176, 352, 704, 1024],
            "2.0x": [-1, 24, 244, 488, 976, 2048],
        }
        if model_size not in channel_table:
            raise NotImplementedError
        self.stage_out_channels = channel_table[model_size]

        # Stem: 3x3 stride-2 conv + BN + ReLU, then a stride-2 max pool.
        stem_width = self.stage_out_channels[1]
        self.first_conv = nn.SequentialCell([
            nn.Conv2d(in_channels, stem_width, kernel_size=3, stride=2,
                      pad_mode="pad", padding=1),
            nn.BatchNorm2d(stem_width),
            nn.ReLU(),
        ])
        self.max_pool = nn.MaxPool2d(kernel_size=3, stride=2, pad_mode="same")

        # Three stages of ShuffleV2 blocks; the first block of each stage
        # downsamples with stride 2, the rest keep spatial size.
        blocks = []
        width_in = stem_width
        for stage_idx, repeats in enumerate(self.stage_repeats):
            width_out = self.stage_out_channels[stage_idx + 2]
            for block_idx in range(repeats):
                if block_idx == 0:
                    blocks.append(ShuffleV2Block(width_in, width_out,
                                                 mid_channels=width_out // 2, kernel_size=3, stride=2))
                else:
                    # Stride-1 blocks split channels in half before the main branch.
                    blocks.append(ShuffleV2Block(width_in // 2, width_out,
                                                 mid_channels=width_out // 2, kernel_size=3, stride=1))
                width_in = width_out

        self.features = nn.SequentialCell(blocks)

        self.conv_last = nn.SequentialCell([
            nn.Conv2d(width_in, self.stage_out_channels[-1], kernel_size=1, stride=1),
            nn.BatchNorm2d(self.stage_out_channels[-1]),
            nn.ReLU()
        ])
        self.pool = GlobalAvgPooling()
        self.classifier = nn.Dense(self.stage_out_channels[-1], num_classes, has_bias=False)
        self._initialize_weights()

    def _initialize_weights(self):
        """Initialize weights: stem convs get Normal(sigma=0.01), other convs
        Normal(sigma=1/fan_in), dense layers Normal(sigma=0.01); biases zero."""
        for name, cell in self.cells_and_names():
            if isinstance(cell, nn.Conv2d):
                sigma = 0.01 if "first" in name else 1.0 / cell.weight.shape[1]
                cell.weight.set_data(
                    init.initializer(init.Normal(sigma, 0), cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(
                        init.initializer("zeros", cell.bias.shape, cell.bias.dtype))
            elif isinstance(cell, nn.Dense):
                cell.weight.set_data(
                    init.initializer(init.Normal(0.01, 0), cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(
                        init.initializer("zeros", cell.bias.shape, cell.bias.dtype))

    def forward_features(self, x: Tensor) -> Tensor:
        """Run the stem and the shuffle stages."""
        return self.features(self.max_pool(self.first_conv(x)))

    def forward_head(self, x: Tensor) -> Tensor:
        """Run the final conv, global pooling and the linear classifier."""
        return self.classifier(self.pool(self.conv_last(x)))

    def construct(self, x: Tensor) -> Tensor:
        return self.forward_head(self.forward_features(x))

mindcv.models.shufflenetv2.ShuffleV2Block

Bases: Cell

define the basic block of ShuffleV2

Source code in mindcv\models\shufflenetv2.py
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
class ShuffleV2Block(nn.Cell):
    """Basic building block of ShuffleNetV2.

    Stride-1 blocks split the input channels in half via ``channel_shuffle``
    and transform only one half; stride-2 blocks transform the full input on
    two parallel branches. Both variants concatenate the shortcut/projection
    branch with the main branch along the channel axis.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        mid_channels: int,
        kernel_size: int,
        stride: int,
    ) -> None:
        super().__init__()
        assert stride in [1, 2]
        self.stride = stride
        pad = kernel_size // 2
        # The main branch only has to produce the channels not carried over
        # by the shortcut/projection branch, since the two are concatenated.
        out_channels = out_channels - in_channels
        branch_main = [
            # pw
            nn.Conv2d(in_channels, mid_channels, kernel_size=1, stride=1),
            nn.BatchNorm2d(mid_channels),
            nn.ReLU(),
            # dw
            nn.Conv2d(mid_channels, mid_channels, kernel_size=kernel_size, stride=stride,
                      pad_mode="pad", padding=pad, group=mid_channels),
            nn.BatchNorm2d(mid_channels),
            # pw-linear
            nn.Conv2d(mid_channels, out_channels, kernel_size=1, stride=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
        ]
        self.branch_main = nn.SequentialCell(branch_main)

        if stride == 2:
            # Downsampling block: the shortcut also needs a depthwise stride-2
            # conv plus pointwise conv to match spatial size.
            branch_proj = [
                # dw
                nn.Conv2d(in_channels, in_channels, kernel_size=kernel_size, stride=stride,
                          pad_mode="pad", padding=pad, group=in_channels),
                nn.BatchNorm2d(in_channels),
                # pw-linear
                nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1),
                nn.BatchNorm2d(in_channels),
                nn.ReLU(),
            ]
            self.branch_proj = nn.SequentialCell(branch_proj)
        else:
            self.branch_proj = None

    def construct(self, old_x: Tensor) -> Tensor:
        if self.stride == 1:
            # Shuffle then split channels; one half passes through untouched.
            x_proj, x = self.channel_shuffle(old_x)
            return ops.concat((x_proj, self.branch_main(x)), axis=1)

        if self.stride == 2:
            # Both branches see the full input when downsampling.
            x_proj = old_x
            x = old_x
            return ops.concat((self.branch_proj(x_proj), self.branch_main(x)), axis=1)
        return None

    @staticmethod
    def channel_shuffle(x: Tensor) -> Tuple[Tensor, Tensor]:
        # Interleave channels across two groups, then return the two halves.
        # The reshape/transpose/reshape sequence is order-sensitive: it pairs
        # up adjacent channels and redistributes them between the two outputs.
        batch_size, num_channels, height, width = x.shape
        x = ops.reshape(x, (batch_size * num_channels // 2, 2, height * width,))
        x = ops.transpose(x, (1, 0, 2,))
        x = ops.reshape(x, (2, -1, num_channels // 2, height, width,))
        return x[0], x[1]

mindcv.models.shufflenetv2.shufflenet_v2_x0_5(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ShuffleNetV2 model with width scaled by 0.5. Refer to the base class models.ShuffleNetV2 for more details.

Source code in mindcv\models\shufflenetv2.py
220
221
222
223
224
225
226
227
228
229
230
231
@register_model
def shufflenet_v2_x0_5(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> ShuffleNetV2:
    """Build a ShuffleNetV2 with channel widths scaled by 0.5.
    See the base class `models.ShuffleNetV2` for details.
    """
    model = ShuffleNetV2(model_size="0.5x", num_classes=num_classes, in_channels=in_channels, **kwargs)
    if pretrained:
        # Load the released checkpoint registered under this model name.
        load_pretrained(model, default_cfgs["shufflenet_v2_x0_5"], num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.shufflenetv2.shufflenet_v2_x1_0(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ShuffleNetV2 model with width scaled by 1.0. Refer to the base class models.ShuffleNetV2 for more details.

Source code in mindcv\models\shufflenetv2.py
234
235
236
237
238
239
240
241
242
243
244
245
@register_model
def shufflenet_v2_x1_0(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> ShuffleNetV2:
    """Build a ShuffleNetV2 with channel widths scaled by 1.0.
    See the base class `models.ShuffleNetV2` for details.
    """
    model = ShuffleNetV2(model_size="1.0x", num_classes=num_classes, in_channels=in_channels, **kwargs)
    if pretrained:
        # Load the released checkpoint registered under this model name.
        load_pretrained(model, default_cfgs["shufflenet_v2_x1_0"], num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.shufflenetv2.shufflenet_v2_x1_5(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ShuffleNetV2 model with width scaled by 1.5. Refer to the base class models.ShuffleNetV2 for more details.

Source code in mindcv\models\shufflenetv2.py
248
249
250
251
252
253
254
255
256
257
258
259
@register_model
def shufflenet_v2_x1_5(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> ShuffleNetV2:
    """Build a ShuffleNetV2 with channel widths scaled by 1.5.
    See the base class `models.ShuffleNetV2` for details.
    """
    model = ShuffleNetV2(model_size="1.5x", num_classes=num_classes, in_channels=in_channels, **kwargs)
    if pretrained:
        # Load the released checkpoint registered under this model name.
        load_pretrained(model, default_cfgs["shufflenet_v2_x1_5"], num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.shufflenetv2.shufflenet_v2_x2_0(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get ShuffleNetV2 model with width scaled by 2.0. Refer to the base class models.ShuffleNetV2 for more details.

Source code in mindcv\models\shufflenetv2.py
262
263
264
265
266
267
268
269
270
271
272
273
@register_model
def shufflenet_v2_x2_0(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> ShuffleNetV2:
    """Build a ShuffleNetV2 with channel widths scaled by 2.0.
    See the base class `models.ShuffleNetV2` for details.
    """
    model = ShuffleNetV2(model_size="2.0x", num_classes=num_classes, in_channels=in_channels, **kwargs)
    if pretrained:
        # Load the released checkpoint registered under this model name.
        load_pretrained(model, default_cfgs["shufflenet_v2_x2_0"], num_classes=num_classes, in_channels=in_channels)
    return model

sknet

mindcv.models.sknet

MindSpore implementation of SKNet. Refer to Selective Kernel Networks.

mindcv.models.sknet.SKNet

Bases: ResNet

SKNet model class, based on "Selective Kernel Networks" <https://arxiv.org/abs/1903.06586>_

PARAMETER DESCRIPTION
block

block of sknet.

TYPE: Type[Cell]

layers

number of layers of each stage.

TYPE: List[int]

num_classes

number of classification classes. Default: 1000.

TYPE: int DEFAULT: 1000

in_channels

number the channels of the input. Default: 3.

TYPE: int DEFAULT: 3

groups

number of groups for group conv in blocks. Default: 1.

TYPE: int DEFAULT: 1

base_width

base width of pre group hidden channel in blocks. Default: 64.

TYPE: int DEFAULT: 64

norm

normalization layer in blocks. Default: None.

TYPE: Optional[Cell] DEFAULT: None

sk_kwargs

kwargs of selective kernel. Default: None.

TYPE: Optional[Dict] DEFAULT: None

Source code in mindcv\models\sknet.py
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
class SKNet(ResNet):
    r"""SKNet model class, based on
    `"Selective Kernel Networks" <https://arxiv.org/abs/1903.06586>`_

    Reuses the ResNet skeleton but overrides ``_make_layer`` so that stages are
    built from selective-kernel blocks, forwarding ``sk_kwargs`` to each block.

    Args:
        block: block of sknet.
        layers: number of layers of each stage.
        num_classes: number of classification classes. Default: 1000.
        in_channels: number the channels of the input. Default: 3.
        groups: number of groups for group conv in blocks. Default: 1.
        base_width: base width of pre group hidden channel in blocks. Default: 64.
        norm: normalization layer in blocks. Default: None.
        sk_kwargs: kwargs of selective kernel. Default: None.
    """

    def __init__(
        self,
        block: Type[nn.Cell],
        layers: List[int],
        num_classes: int = 1000,
        in_channels: int = 3,
        groups: int = 1,
        base_width: int = 64,
        norm: Optional[nn.Cell] = None,
        sk_kwargs: Optional[Dict] = None,
    ) -> None:
        # Must be set before super().__init__, because the base constructor
        # calls _make_layer, which reads self.sk_kwargs.
        self.sk_kwargs: Optional[Dict] = sk_kwargs  # make pylint happy
        super().__init__(block, layers, num_classes, in_channels, groups, base_width, norm)

    def _make_layer(
        self,
        block: Type[Union[SelectiveKernelBasic, SelectiveKernelBottleneck]],
        channels: int,
        block_nums: int,
        stride: int = 1,
    ) -> nn.SequentialCell:
        """Build one stage of ``block_nums`` selective-kernel blocks."""
        down_sample = None

        # A 1x1 projection is needed whenever spatial size or channel count
        # changes between the shortcut and the block output.
        if stride != 1 or self.input_channels != channels * block.expansion:
            down_sample = nn.SequentialCell([
                nn.Conv2d(self.input_channels, channels * block.expansion, kernel_size=1, stride=stride),
                self.norm(channels * block.expansion)
            ])

        layers = []
        # Only the first block of a stage takes the stride and projection.
        layers.append(
            block(
                self.input_channels,
                channels,
                stride=stride,
                down_sample=down_sample,
                groups=self.groups,
                # NOTE(review): `self.base_with` looks like a typo of
                # `base_width`; presumably it is the attribute name defined by
                # the ResNet base class -- confirm there before renaming.
                base_width=self.base_with,
                norm=self.norm,
                sk_kwargs=self.sk_kwargs,
            )
        )
        self.input_channels = channels * block.expansion

        for _ in range(1, block_nums):
            layers.append(
                block(
                    self.input_channels,
                    channels,
                    groups=self.groups,
                    base_width=self.base_with,
                    norm=self.norm,
                    sk_kwargs=self.sk_kwargs,
                )
            )

        return nn.SequentialCell(layers)

mindcv.models.sknet.SelectiveKernelBasic

Bases: Cell

build basic block of sknet

Source code in mindcv\models\sknet.py
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
class SelectiveKernelBasic(nn.Cell):
    """Basic residual block of SKNet: a selective-kernel conv followed by a
    plain 3x3 conv, with an identity (or projected) shortcut."""

    expansion = 1

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        stride: int = 1,
        groups: int = 1,
        down_sample: Optional[nn.Cell] = None,
        base_width: int = 64,
        norm: Optional[nn.Cell] = None,
        sk_kwargs: Optional[Dict] = None,
    ):
        super().__init__()
        norm = nn.BatchNorm2d if norm is None else norm
        sk_kwargs = {} if sk_kwargs is None else sk_kwargs

        # The basic block does not support grouped convs or a non-default width.
        assert groups == 1, "BasicBlock only supports cardinality of 1"
        assert base_width == 64, "BasicBlock doest not support changing base width"

        self.conv1 = SelectiveKernel(
            in_channels, out_channels, stride=stride, **sk_kwargs)
        self.conv2 = nn.SequentialCell([
            nn.Conv2d(out_channels, out_channels * self.expansion, kernel_size=3, padding=1, pad_mode="pad"),
            norm(out_channels * self.expansion)
        ])

        self.relu = nn.ReLU()
        self.down_sample = down_sample

    def construct(self, x: Tensor) -> Tensor:
        out = self.conv2(self.conv1(x))
        # Project the shortcut only when shape/stride changes require it.
        shortcut = x if self.down_sample is None else self.down_sample(x)
        out += shortcut
        return self.relu(out)

mindcv.models.sknet.SelectiveKernelBottleneck

Bases: Cell

build the bottleneck of the sknet

Source code in mindcv\models\sknet.py
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
class SelectiveKernelBottleneck(nn.Cell):
    """Bottleneck residual block of SKNet: 1x1 reduce, selective-kernel conv,
    1x1 expand, with an identity (or projected) shortcut.

    Args:
        in_channels: number of input channels.
        out_channels: base number of output channels (final width is
            ``out_channels * expansion``).
        stride: stride of the selective-kernel conv. Default: 1.
        down_sample: optional projection cell applied to the shortcut. Default: None.
        groups: number of groups for group conv. Default: 1.
        base_width: base width of per-group hidden channels. Default: 64.
        norm: normalization layer class. Default: None (uses nn.BatchNorm2d).
        sk_kwargs: kwargs forwarded to SelectiveKernel. Default: None.
    """

    expansion = 4

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        stride: int = 1,
        down_sample: Optional[nn.Cell] = None,
        groups: int = 1,
        base_width: int = 64,
        norm: Optional[nn.Cell] = None,
        sk_kwargs: Optional[Dict] = None,
    ):
        super().__init__()
        if norm is None:
            norm = nn.BatchNorm2d

        if sk_kwargs is None:
            sk_kwargs = {}

        # Hidden width scales with base_width and groups (ResNeXt convention).
        width = int(out_channels * (base_width / 64.0)) * groups
        self.conv1 = nn.SequentialCell([
            nn.Conv2d(in_channels, width, kernel_size=1),
            norm(width)
        ])
        self.conv2 = SelectiveKernel(
            width, width, stride=stride, groups=groups, **sk_kwargs)
        self.conv3 = nn.SequentialCell([
            nn.Conv2d(width, out_channels * self.expansion, kernel_size=1),
            norm(out_channels * self.expansion)
        ])

        self.relu = nn.ReLU()
        self.down_sample = down_sample

    def construct(self, x: Tensor) -> Tensor:
        identity = x

        out = self.conv1(x)
        out = self.conv2(out)
        out = self.conv3(out)

        # Explicit None check instead of truthiness: a container-like cell can
        # evaluate falsy and be silently skipped. Also matches the check used
        # by SelectiveKernelBasic.
        if self.down_sample is not None:
            identity = self.down_sample(x)
        out += identity
        out = self.relu(out)
        return out

mindcv.models.sknet.skresnet18(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get 18 layers SKNet model. Refer to the base class models.SKNet for more details.

Source code in mindcv\models\sknet.py
218
219
220
221
222
223
224
225
226
227
228
229
230
231
@register_model
def skresnet18(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> ResNet:
    """Build an 18-layer SKNet.
    See the base class `models.SKNet` for details.
    """
    sk_kwargs = {"rd_ratio": 1 / 8, "rd_divisor": 16, "split_input": True}
    model = SKNet(SelectiveKernelBasic, [2, 2, 2, 2], num_classes=num_classes, in_channels=in_channels,
                  sk_kwargs=sk_kwargs, **kwargs)
    if pretrained:
        # Load the released checkpoint registered under this model name.
        load_pretrained(model, default_cfgs["skresnet18"], num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.sknet.skresnet34(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get 34 layers SKNet model. Refer to the base class models.SKNet for more details.

Source code in mindcv\models\sknet.py
234
235
236
237
238
239
240
241
242
243
244
245
246
247
@register_model
def skresnet34(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> ResNet:
    """Build a 34-layer SKNet.
    See the base class `models.SKNet` for details.
    """
    sk_kwargs = {"rd_ratio": 1 / 8, "rd_divisor": 16, "split_input": True}
    model = SKNet(SelectiveKernelBasic, [3, 4, 6, 3], num_classes=num_classes, in_channels=in_channels,
                  sk_kwargs=sk_kwargs, **kwargs)
    if pretrained:
        # Load the released checkpoint registered under this model name.
        load_pretrained(model, default_cfgs["skresnet34"], num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.sknet.skresnet50(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get 50 layers SKNet model. Refer to the base class models.SKNet for more details.

Source code in mindcv\models\sknet.py
250
251
252
253
254
255
256
257
258
259
260
261
262
263
@register_model
def skresnet50(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> ResNet:
    """Build a 50-layer SKNet.
    See the base class `models.SKNet` for details.
    """
    sk_kwargs = {"split_input": True}
    model = SKNet(SelectiveKernelBottleneck, [3, 4, 6, 3], num_classes=num_classes, in_channels=in_channels,
                  sk_kwargs=sk_kwargs, **kwargs)
    if pretrained:
        # Load the released checkpoint registered under this model name.
        load_pretrained(model, default_cfgs["skresnet50"], num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.sknet.skresnext50_32x4d(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get 50 layers SKNeXt model with 32 groups of GPConv. Refer to the base class models.SKNet for more details.

Source code in mindcv\models\sknet.py
266
267
268
269
270
271
272
273
274
275
276
277
278
279
@register_model
def skresnext50_32x4d(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> ResNet:
    """Build a 50-layer SKNeXt with 32 groups of GPConv.
    See the base class `models.SKNet` for details.
    """
    sk_kwargs = {"rd_ratio": 1 / 16, "rd_divisor": 32, "split_input": False}
    model = SKNet(SelectiveKernelBottleneck, [3, 4, 6, 3], num_classes=num_classes, in_channels=in_channels,
                  sk_kwargs=sk_kwargs, **kwargs)
    if pretrained:
        # Load the released checkpoint registered under this model name.
        load_pretrained(model, default_cfgs["skresnext50_32x4d"], num_classes=num_classes, in_channels=in_channels)
    return model

squeezenet

mindcv.models.squeezenet

MindSpore implementation of SqueezeNet. Refer to SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and <0.5MB model size.

mindcv.models.squeezenet.Fire

Bases: Cell

define the basic block of squeezenet

Source code in mindcv\models\squeezenet.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
class Fire(nn.Cell):
    """Fire module of SqueezeNet: a 1x1 "squeeze" conv followed by parallel
    1x1 and 3x3 "expand" convs whose outputs are concatenated on channels."""

    def __init__(
        self,
        in_channels: int,
        squeeze_channels: int,
        expand1x1_channels: int,
        expand3x3_channels: int,
    ) -> None:
        super().__init__()
        # Attribute names are kept as-is; released checkpoints key on them.
        self.squeeze = nn.Conv2d(in_channels, squeeze_channels, kernel_size=1, has_bias=True)
        self.squeeze_activation = nn.ReLU()
        self.expand1x1 = nn.Conv2d(squeeze_channels, expand1x1_channels, kernel_size=1, has_bias=True)
        self.expand1x1_activation = nn.ReLU()
        self.expand3x3 = nn.Conv2d(squeeze_channels, expand3x3_channels, kernel_size=3, pad_mode="same", has_bias=True)
        self.expand3x3_activation = nn.ReLU()

    def construct(self, x: Tensor) -> Tensor:
        squeezed = self.squeeze_activation(self.squeeze(x))
        left = self.expand1x1_activation(self.expand1x1(squeezed))
        right = self.expand3x3_activation(self.expand3x3(squeezed))
        return ops.concat((left, right), axis=1)

mindcv.models.squeezenet.SqueezeNet

Bases: Cell

SqueezeNet model class, based on "SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and <0.5MB model size" <https://arxiv.org/abs/1602.07360>_ # noqa: E501

.. note:: Important: In contrast to the other models, SqueezeNet expects tensors with a size of N x 3 x 227 x 227, so ensure your images are sized accordingly.

PARAMETER DESCRIPTION
version

version of the architecture, '1_0' or '1_1'. Default: '1_0'.

TYPE: str DEFAULT: '1_0'

num_classes

number of classification classes. Default: 1000.

TYPE: int DEFAULT: 1000

drop_rate

dropout rate of the classifier. Default: 0.5.

TYPE: float DEFAULT: 0.5

in_channels

number the channels of the input. Default: 3.

TYPE: int DEFAULT: 3

Source code in mindcv\models\squeezenet.py
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
class SqueezeNet(nn.Cell):
    r"""SqueezeNet model class, based on
    `"SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and <0.5MB model size" <https://arxiv.org/abs/1602.07360>`_  # noqa: E501

    .. note::
        **Important**: In contrast to the other models, SqueezeNet expects tensors with a size of
        N x 3 x 227 x 227, so ensure your images are sized accordingly.

    Args:
        version: version of the architecture, '1_0' or '1_1'. Default: '1_0'.
        num_classes: number of classification classes. Default: 1000.
        drop_rate: dropout rate of the classifier. Default: 0.5.
        in_channels: number the channels of the input. Default: 3.
    """

    def __init__(
        self,
        version: str = "1_0",
        num_classes: int = 1000,
        drop_rate: float = 0.5,
        in_channels: int = 3,
    ) -> None:
        super().__init__()
        if version == "1_0":
            # Original v1.0: 7x7/96 stem conv, later pooling.
            self.features = nn.SequentialCell([
                nn.Conv2d(in_channels, 96, kernel_size=7, stride=2, pad_mode="valid", has_bias=True),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=3, stride=2),
                Fire(96, 16, 64, 64),
                Fire(128, 16, 64, 64),
                Fire(128, 32, 128, 128),
                nn.MaxPool2d(kernel_size=3, stride=2),
                Fire(256, 32, 128, 128),
                Fire(256, 48, 192, 192),
                Fire(384, 48, 192, 192),
                Fire(384, 64, 256, 256),
                nn.MaxPool2d(kernel_size=3, stride=2),
                Fire(512, 64, 256, 256),
            ])
        elif version == "1_1":
            # v1.1: cheaper 3x3/64 stem conv and earlier pooling.
            self.features = nn.SequentialCell([
                nn.Conv2d(in_channels, 64, kernel_size=3, stride=2, padding=1, pad_mode="pad", has_bias=True),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=3, stride=2),
                Fire(64, 16, 64, 64),
                Fire(128, 16, 64, 64),
                nn.MaxPool2d(kernel_size=3, stride=2),
                Fire(128, 32, 128, 128),
                Fire(256, 32, 128, 128),
                nn.MaxPool2d(kernel_size=3, stride=2),
                Fire(256, 48, 192, 192),
                Fire(384, 48, 192, 192),
                Fire(384, 64, 256, 256),
                Fire(512, 64, 256, 256),
            ])
        else:
            raise ValueError(f"Unsupported SqueezeNet version {version}: 1_0 or 1_1 expected")

        # Classification head: 1x1 conv with one channel per class, then
        # global average pooling -- no fully-connected layer.
        self.final_conv = nn.Conv2d(512, num_classes, kernel_size=1, has_bias=True)
        self.classifier = nn.SequentialCell([
            Dropout(p=drop_rate),
            self.final_conv,
            nn.ReLU(),
            GlobalAvgPooling()
        ])
        self._initialize_weights()

    def _initialize_weights(self):
        """Initialize weights for cells."""
        for _, cell in self.cells_and_names():
            if isinstance(cell, nn.Conv2d):
                # The classifier conv gets a plain normal init; all other
                # convs use He-uniform. Biases are zeroed.
                if cell is self.final_conv:
                    cell.weight.set_data(init.initializer(init.Normal(), cell.weight.shape, cell.weight.dtype))
                else:
                    cell.weight.set_data(init.initializer(init.HeUniform(), cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(init.initializer("zeros", cell.bias.shape, cell.bias.dtype))

    def forward_features(self, x: Tensor) -> Tensor:
        x = self.features(x)
        return x

    def forward_head(self, x: Tensor) -> Tensor:
        x = self.classifier(x)
        return x

    def construct(self, x: Tensor) -> Tensor:
        x = self.forward_features(x)
        x = self.forward_head(x)
        return x

mindcv.models.squeezenet.squeezenet1_0(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get SqueezeNet model of version 1.0. Refer to the base class models.SqueezeNet for more details.

Source code in mindcv\models\squeezenet.py
153
154
155
156
157
158
159
160
161
162
163
164
@register_model
def squeezenet1_0(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> SqueezeNet:
    """Build SqueezeNet version 1.0.
    See the base class `models.SqueezeNet` for details.
    """
    model = SqueezeNet(version="1_0", num_classes=num_classes, in_channels=in_channels, **kwargs)
    if pretrained:
        # Load the released checkpoint registered under this model name.
        load_pretrained(model, default_cfgs["squeezenet1_0"], num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.squeezenet.squeezenet1_1(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get SqueezeNet model of version 1.1. Refer to the base class models.SqueezeNet for more details.

Source code in mindcv\models\squeezenet.py
167
168
169
170
171
172
173
174
175
176
177
178
@register_model
def squeezenet1_1(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> SqueezeNet:
    """Build SqueezeNet version 1.1.
    See the base class `models.SqueezeNet` for details.
    """
    model = SqueezeNet(version="1_1", num_classes=num_classes, in_channels=in_channels, **kwargs)
    if pretrained:
        # Load the released checkpoint registered under this model name.
        load_pretrained(model, default_cfgs["squeezenet1_1"], num_classes=num_classes, in_channels=in_channels)
    return model

swintransformer

mindcv.models.swintransformer

Define SwinTransformer model

mindcv.models.swintransformer.BasicLayer

Bases: Cell

A basic Swin Transformer layer for one stage.

PARAMETER DESCRIPTION
dim

Number of input channels.

TYPE: int

input_resolution

Input resolution.

TYPE: tuple[int]

depth

Number of blocks.

TYPE: int

num_heads

Number of attention heads.

TYPE: int

window_size

Local window size.

TYPE: int

mlp_ratio

Ratio of mlp hidden dim to embedding dim.

TYPE: float DEFAULT: 4.0

qkv_bias

If True, add a learnable bias to query, key, value. Default: True

TYPE: bool DEFAULT: True

qk_scale

Override default qk scale of head_dim ** -0.5 if set.

TYPE: float | None DEFAULT: None

drop

Dropout rate. Default: 0.0

TYPE: float DEFAULT: 0.0

attn_drop

Attention dropout rate. Default: 0.0

TYPE: float DEFAULT: 0.0

drop_path

Stochastic depth rate. Default: 0.0

TYPE: float | tuple[float] DEFAULT: 0.0

norm_layer

Normalization layer. Default: nn.LayerNorm

TYPE: Cell DEFAULT: LayerNorm

downsample

Downsample layer at the end of the layer. Default: None

TYPE: Cell | None DEFAULT: None

Source code in mindcv\models\swintransformer.py
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
class BasicLayer(nn.Cell):
    """A basic Swin Transformer layer for one stage.

    Args:
        dim (int): Number of input channels.
        input_resolution (tuple[int]): Input resolution.
        depth (int): Number of blocks.
        num_heads (int): Number of attention heads.
        window_size (int): Local window size.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float | list[float], optional): Stochastic depth rate, per-block when a list. Default: 0.0
        norm_layer (nn.Cell, optional): Normalization layer. Default: nn.LayerNorm
        downsample (nn.Cell | None, optional): Downsample layer at the end of the layer. Default: None
    """

    def __init__(
        self,
        dim: int,
        input_resolution: Tuple[int],
        depth: int,
        num_heads: int,
        window_size: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = True,
        qk_scale: Optional[float] = None,
        drop: float = 0.0,
        attn_drop: float = 0.0,
        drop_path: Optional[float] = 0.0,
        norm_layer: Optional[nn.Cell] = nn.LayerNorm,
        downsample: Optional[nn.Cell] = None,
    ) -> None:
        super().__init__()
        self.dim = dim
        self.input_resolution = input_resolution
        self.depth = depth

        # build blocks: even-indexed blocks use regular windows (shift 0),
        # odd-indexed blocks use shifted windows (shift = window_size // 2)
        self.blocks = nn.CellList([
            SwinTransformerBlock(dim=dim, input_resolution=input_resolution,
                                 num_heads=num_heads, window_size=window_size,
                                 shift_size=0 if (i % 2 == 0) else window_size // 2,  # TODO: the shifted case (window_size // 2) is particularly slow
                                 mlp_ratio=mlp_ratio,
                                 qkv_bias=qkv_bias, qk_scale=qk_scale,
                                 drop=drop, attn_drop=attn_drop,
                                 drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
                                 norm_layer=norm_layer)
            for i in range(depth)])

        # patch merging layer (spatial downsampling between stages), if requested
        if downsample is not None:
            self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer)
        else:
            self.downsample = None

    def construct(self, x: Tensor) -> Tensor:
        # Run all transformer blocks of this stage, then optionally downsample.
        for blk in self.blocks:
            x = blk(x)
        if self.downsample is not None:
            x = self.downsample(x)
        return x

    def extra_repr(self) -> str:
        return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"

mindcv.models.swintransformer.PatchEmbed

Bases: Cell

Image to Patch Embedding

PARAMETER DESCRIPTION
image_size

Image size. Default: 224.

TYPE: int DEFAULT: 224

patch_size

Patch token size. Default: 4.

TYPE: int DEFAULT: 4

in_chans

Number of input image channels. Default: 3.

TYPE: int DEFAULT: 3

embed_dim

Number of linear projection output channels. Default: 96.

TYPE: int DEFAULT: 96

norm_layer

Normalization layer. Default: None

TYPE: Cell DEFAULT: None

Source code in mindcv\models\swintransformer.py
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
class PatchEmbed(nn.Cell):
    """Image to Patch Embedding

    Projects an image into a sequence of non-overlapping patch embeddings
    with a strided convolution, optionally followed by a normalization layer.

    Args:
        image_size (int): Image size.  Default: 224.
        patch_size (int): Patch token size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        norm_layer (nn.Cell, optional): Normalization layer. Default: None
    """

    def __init__(
        self,
        image_size: int = 224,
        patch_size: int = 4,
        in_chans: int = 3,
        embed_dim: int = 96,
        norm_layer: Optional[nn.Cell] = None,
    ) -> None:
        super().__init__()
        img_hw = to_2tuple(image_size)
        patch_hw = to_2tuple(patch_size)
        grid_hw = [img_hw[0] // patch_hw[0], img_hw[1] // patch_hw[1]]
        self.image_size = img_hw
        self.patch_size = patch_hw
        self.patches_resolution = grid_hw
        self.num_patches = grid_hw[0] * grid_hw[1]

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        # Non-overlapping patch projection: kernel == stride == patch size.
        self.proj = nn.Conv2d(in_channels=in_chans, out_channels=embed_dim, kernel_size=patch_hw, stride=patch_hw,
                              pad_mode="pad", has_bias=True, weight_init="TruncatedNormal")

        if norm_layer is None:
            self.norm = None
        else:
            # LayerNorm-style layers expect the normalized shape as a sequence.
            norm_shape = (embed_dim,) if isinstance(embed_dim, int) else embed_dim
            self.norm = norm_layer(norm_shape, epsilon=1e-5)

    def construct(self, x: Tensor) -> Tensor:
        batch = x.shape[0]
        # FIXME look at relaxing size constraints
        # (b, c, h, w) -> (b, embed_dim, Ph*Pw) -> (b, Ph*Pw, embed_dim)
        out = ops.reshape(self.proj(x), (batch, self.embed_dim, -1))
        out = ops.transpose(out, (0, 2, 1))

        if self.norm is not None:
            out = self.norm(out)
        return out

mindcv.models.swintransformer.PatchMerging

Bases: Cell

Patch Merging Layer.

PARAMETER DESCRIPTION
input_resolution

Resolution of input feature.

TYPE: tuple[int]

dim

Number of input channels.

TYPE: int

norm_layer

Normalization layer. Default: nn.LayerNorm

TYPE: Module DEFAULT: LayerNorm

Source code in mindcv\models\swintransformer.py
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
class PatchMerging(nn.Cell):
    """Patch Merging Layer.

    Downsamples an (H, W) token grid 2x in each spatial dimension by gathering
    every 2x2 neighborhood onto the channel axis (C -> 4C), then normalizing
    and linearly reducing to 2C.

    Args:
        input_resolution (tuple[int]): Resolution of input feature.
        dim (int): Number of input channels. A 1-element tuple is also accepted.
        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
    """

    def __init__(
        self,
        input_resolution: Tuple[int],
        dim: int,
        norm_layer: Optional[nn.Cell] = nn.LayerNorm,
    ) -> None:
        super().__init__()
        self.input_resolution = input_resolution
        # Normalize dim to a plain int so the Dense/norm shapes below are valid
        # even when a 1-element tuple is passed (previously only self.dim was
        # normalized while the raw `dim` was used for the layer shapes).
        self.dim = dim[0] if isinstance(dim, tuple) and len(dim) == 1 else dim
        # has_bias is False by default in the reference Swin implementation.
        self.reduction = nn.Dense(in_channels=4 * self.dim, out_channels=2 * self.dim, has_bias=False)
        self.norm = norm_layer([self.dim * 4, ])
        self.H, self.W = self.input_resolution
        self.H_2, self.W_2 = self.H // 2, self.W // 2
        # Number of tokens after merging; was assigned twice in the original.
        self.H2W2 = int(self.H * self.W // 4)
        self.dim_mul_4 = int(self.dim * 4)

    def construct(self, x: Tensor) -> Tensor:
        """
        x: B, H*W, C
        """
        b = x.shape[0]
        # Expose the 2x2 grid, then move the four neighbors onto the channel axis.
        x = ops.reshape(x, (b, self.H_2, 2, self.W_2, 2, self.dim))
        x = ops.transpose(x, (0, 1, 3, 4, 2, 5))
        x = ops.reshape(x, (b, self.H2W2, self.dim_mul_4))
        x = self.norm(x)
        x = self.reduction(x)

        return x

    def extra_repr(self) -> str:
        return f"input_resolution={self.input_resolution}, dim={self.dim}"
mindcv.models.swintransformer.PatchMerging.construct(x)
Source code in mindcv\models\swintransformer.py
429
430
431
432
433
434
435
436
437
438
439
440
def construct(self, x: Tensor) -> Tensor:
    """
    x: B, H*W, C

    Merges each 2x2 patch neighborhood: exposes the 2x2 grid via reshape,
    moves the four neighbors onto the channel axis (C -> 4C), then
    normalizes and linearly reduces to 2C.
    """
    b = x.shape[0]
    x = ops.reshape(x, (b, self.H_2, 2, self.W_2, 2, self.dim))
    x = ops.transpose(x, (0, 1, 3, 4, 2, 5))
    x = ops.reshape(x, (b, self.H2W2, self.dim_mul_4))
    x = self.norm(x)
    x = self.reduction(x)

    return x

mindcv.models.swintransformer.SwinTransformer

Bases: Cell

SwinTransformer model class, based on "Swin Transformer: Hierarchical Vision Transformer using Shifted Windows" <https://arxiv.org/pdf/2103.14030>_

PARAMETER DESCRIPTION
image_size

Input image size. Default 224

TYPE: int | tuple(int) DEFAULT: 224

patch_size

Patch size. Default: 4

TYPE: int | tuple(int) DEFAULT: 4

in_chans

Number of input image channels. Default: 3

TYPE: int DEFAULT: 3

num_classes

Number of classes for classification head. Default: 1000

TYPE: int DEFAULT: 1000

embed_dim

Patch embedding dimension. Default: 96

TYPE: int DEFAULT: 96

depths

Depth of each Swin Transformer layer.

TYPE: tuple(int) DEFAULT: None

num_heads

Number of attention heads in different layers.

TYPE: tuple(int) DEFAULT: None

window_size

Window size. Default: 7

TYPE: int DEFAULT: 7

mlp_ratio

Ratio of mlp hidden dim to embedding dim. Default: 4

TYPE: float DEFAULT: 4.0

qkv_bias

If True, add a learnable bias to query, key, value. Default: True

TYPE: bool DEFAULT: True

qk_scale

Override default qk scale of head_dim ** -0.5 if set. Default: None

TYPE: float DEFAULT: None

drop_rate

Dropout rate. Default: 0

TYPE: float DEFAULT: 0.0

attn_drop_rate

Attention dropout rate. Default: 0

TYPE: float DEFAULT: 0.0

drop_path_rate

Stochastic depth rate. Default: 0.1

TYPE: float DEFAULT: 0.1

norm_layer

Normalization layer. Default: nn.LayerNorm.

TYPE: Cell DEFAULT: LayerNorm

ape

If True, add absolute position embedding to the patch embedding. Default: False

TYPE: bool DEFAULT: False

patch_norm

If True, add normalization after patch embedding. Default: True

TYPE: bool DEFAULT: True

Source code in mindcv\models\swintransformer.py
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
class SwinTransformer(nn.Cell):
    r"""SwinTransformer model class, based on
    `"Swin Transformer: Hierarchical Vision Transformer using Shifted Windows" <https://arxiv.org/pdf/2103.14030>`_

    Args:
        image_size (int | tuple(int)): Input image size. Default 224
        patch_size (int | tuple(int)): Patch size. Default: 4
        in_chans (int): Number of input image channels. Default: 3
        num_classes (int): Number of classes for classification head. Default: 1000
        embed_dim (int): Patch embedding dimension. Default: 96
        depths (tuple(int)): Depth of each Swin Transformer layer.
        num_heads (tuple(int)): Number of attention heads in different layers.
        window_size (int): Window size. Default: 7
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None
        drop_rate (float): Dropout rate. Default: 0
        attn_drop_rate (float): Attention dropout rate. Default: 0
        drop_path_rate (float): Stochastic depth rate. Default: 0.1
        norm_layer (nn.Cell): Normalization layer. Default: nn.LayerNorm.
        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
        patch_norm (bool): If True, add normalization after patch embedding. Default: True
    """

    def __init__(
        self,
        image_size: int = 224,
        patch_size: int = 4,
        in_chans: int = 3,
        num_classes: int = 1000,
        embed_dim: int = 96,
        depths: Optional[List[int]] = None,
        num_heads: Optional[List[int]] = None,
        window_size: int = 7,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = True,
        qk_scale: Optional[int] = None,
        drop_rate: float = 0.0,
        attn_drop_rate: float = 0.0,
        drop_path_rate: float = 0.1,
        norm_layer: Optional[nn.Cell] = nn.LayerNorm,
        ape: bool = False,
        patch_norm: bool = True,
    ) -> None:
        super().__init__()

        self.num_classes = num_classes
        self.num_layers = len(depths)
        self.embed_dim = embed_dim
        self.ape = ape
        self.patch_norm = patch_norm
        # Channel width of the final stage: embed_dim doubles at each merge.
        self.num_features = int(embed_dim * 2 ** (self.num_layers - 1))
        self.mlp_ratio = mlp_ratio

        # split image into non-overlapping patches
        self.patch_embed = PatchEmbed(
            image_size=image_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
            norm_layer=norm_layer if self.patch_norm else None)
        num_patches = self.patch_embed.num_patches
        patches_resolution = self.patch_embed.patches_resolution
        self.patches_resolution = patches_resolution

        # absolute position embedding
        if self.ape:
            # Fix: np.zeros takes the shape as a single tuple; passing three
            # separate ints raised a TypeError whenever ape=True.
            self.absolute_pos_embed = Parameter(Tensor(np.zeros((1, num_patches, embed_dim)), dtype=mstype.float32))

        self.pos_drop = Dropout(p=drop_rate)

        # stochastic depth: linearly increasing drop-path rate per block
        dpr = list(np.linspace(0, drop_path_rate, sum(depths)))  # stochastic depth decay rule

        # build layers: resolution halves and channel width doubles per stage
        self.layers = nn.CellList()
        for i_layer in range(self.num_layers):
            layer = BasicLayer(dim=int(embed_dim * 2 ** i_layer),
                               input_resolution=(patches_resolution[0] // (2 ** i_layer),
                                                 patches_resolution[1] // (2 ** i_layer)),
                               depth=depths[i_layer],
                               num_heads=num_heads[i_layer],
                               window_size=window_size,
                               mlp_ratio=self.mlp_ratio,
                               qkv_bias=qkv_bias, qk_scale=qk_scale,
                               drop=drop_rate, attn_drop=attn_drop_rate,
                               drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
                               norm_layer=norm_layer,
                               downsample=PatchMerging if (i_layer < self.num_layers - 1) else None)
            self.layers.append(layer)

        self.norm = norm_layer([self.num_features, ], epsilon=1e-5)
        self.classifier = nn.Dense(in_channels=self.num_features,
                                   out_channels=num_classes, has_bias=True) if num_classes > 0 else Identity()
        self._initialize_weights()

    def _initialize_weights(self) -> None:
        """Initialize weights: truncated-normal for Dense, ones/zeros for LayerNorm."""
        for _, cell in self.cells_and_names():
            if isinstance(cell, nn.Dense):
                cell.weight.set_data(init.initializer(init.TruncatedNormal(sigma=0.02),
                                                      cell.weight.shape, cell.weight.dtype))
                # The inner isinstance(cell, nn.Dense) re-check was redundant here.
                if cell.bias is not None:
                    cell.bias.set_data(init.initializer(init.Zero(), cell.bias.shape, cell.bias.dtype))
            elif isinstance(cell, nn.LayerNorm):
                cell.gamma.set_data(init.initializer(init.One(), cell.gamma.shape, cell.gamma.dtype))
                cell.beta.set_data(init.initializer(init.Zero(), cell.beta.shape, cell.beta.dtype))

    def no_weight_decay(self):
        """Parameter names excluded from weight decay."""
        return {"absolute_pos_embed"}

    def no_weight_decay_keywords(self):
        """Parameter-name keywords excluded from weight decay."""
        return {"relative_position_bias_table"}

    def forward_head(self, x: Tensor) -> Tensor:
        x = self.classifier(x)
        return x

    def forward_features(self, x: Tensor) -> Tensor:
        x = self.patch_embed(x)
        if self.ape:
            x = x + self.absolute_pos_embed
        x = self.pos_drop(x)
        for layer in self.layers:
            x = layer(x)
        x = self.norm(x)  # B L C
        # Global average pool over the token dimension.
        x = ops.mean(ops.transpose(x, (0, 2, 1)), 2)  # B C 1
        return x

    def construct(self, x: Tensor) -> Tensor:
        x = self.forward_features(x)
        x = self.forward_head(x)
        return x

mindcv.models.swintransformer.SwinTransformerBlock

Bases: Cell

Swin Transformer Block.

PARAMETER DESCRIPTION
dim

Number of input channels.

TYPE: int

input_resolution

Input resolution.

TYPE: tuple[int]

num_heads

Number of attention heads.

TYPE: int

window_size

Window size.

TYPE: int DEFAULT: 7

shift_size

Shift size for SW-MSA.

TYPE: int DEFAULT: 0

mlp_ratio

Ratio of mlp hidden dim to embedding dim.

TYPE: float DEFAULT: 4.0

qkv_bias

If True, add a learnable bias to query, key, value. Default: True

TYPE: bool DEFAULT: True

qk_scale

Override default qk scale of head_dim ** -0.5 if set.

TYPE: float | None DEFAULT: None

drop

Dropout rate. Default: 0.0

TYPE: float DEFAULT: 0.0

attn_drop

Attention dropout rate. Default: 0.0

TYPE: float DEFAULT: 0.0

drop_path

Stochastic depth rate. Default: 0.0

TYPE: float DEFAULT: 0.0

act_layer

Activation layer. Default: nn.GELU

TYPE: Cell DEFAULT: GELU

norm_layer

Normalization layer. Default: nn.LayerNorm

TYPE: Cell DEFAULT: LayerNorm

Source code in mindcv\models\swintransformer.py
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
class SwinTransformerBlock(nn.Cell):
    """Swin Transformer Block.

    One transformer block: window attention (regular or shifted) with a
    residual connection, followed by an MLP with a residual connection.

    Args:
        dim (int): Number of input channels.
        input_resolution (tuple[int]): Input resolution.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        shift_size (int): Shift size for SW-MSA.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Cell, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Cell, optional): Normalization layer.  Default: nn.LayerNorm
    """

    def __init__(
        self,
        dim: int,
        input_resolution: Tuple[int],
        num_heads: int,
        window_size: int = 7,
        shift_size: int = 0,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = True,
        qk_scale: Optional[float] = None,
        drop: float = 0.0,
        attn_drop: float = 0.0,
        drop_path: float = 0.0,
        act_layer: Optional[nn.Cell] = nn.GELU,
        norm_layer: Optional[nn.Cell] = nn.LayerNorm,
    ) -> None:
        super(SwinTransformerBlock, self).__init__()
        self.dim = dim
        self.input_resolution = input_resolution
        self.num_heads = num_heads
        self.window_size = window_size
        self.shift_size = shift_size
        self.mlp_ratio = mlp_ratio
        if min(self.input_resolution) <= self.window_size:
            # if window size is larger than input resolution, we don't partition windows
            self.shift_size = 0
            self.window_size = min(self.input_resolution)

        # LayerNorm expects the normalized shape as a sequence.
        if isinstance(dim, int):
            dim = (dim,)

        self.norm1 = norm_layer(dim, epsilon=1e-5)
        self.attn = WindowAttention(
            dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
            qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)

        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity()
        self.norm2 = norm_layer(dim, epsilon=1e-5)
        mlp_hidden_dim = int((dim[0] if isinstance(dim, tuple) else dim) * mlp_ratio)
        self.mlp = Mlp(in_features=dim[0] if isinstance(dim, tuple) else dim, hidden_features=mlp_hidden_dim,
                       act_layer=act_layer, drop=drop)
        if self.shift_size > 0:
            # calculate attention mask for SW-MSA: label the 9 regions created by
            # the cyclic shift, then forbid attention across region boundaries.
            h_, w_ = self.input_resolution
            img_mask = np.zeros((1, h_, w_, 1))  # 1 H W 1
            h_slices = (slice(0, -self.window_size),
                        slice(-self.window_size, -self.shift_size),
                        slice(-self.shift_size, None))
            w_slices = (slice(0, -self.window_size),
                        slice(-self.window_size, -self.shift_size),
                        slice(-self.shift_size, None))
            cnt = 0
            for h in h_slices:
                for w in w_slices:
                    img_mask[:, h, w, :] = cnt
                    cnt += 1
            # img_mask: [1, 56, 56, 1] window_size: 7
            mask_windows = window_partition(img_mask, self.window_size)  # nW, window_size, window_size, 1
            mask_windows = mask_windows.reshape(-1, self.window_size * self.window_size)
            # Pairwise region difference: 0 means same region (attend), nonzero means
            # cross-region (masked with a large negative value before softmax).
            attn_mask = mask_windows[:, np.newaxis] - mask_windows[:, :, np.newaxis]
            # [64, 49, 49] ==> [1, 64, 1, 49, 49]
            attn_mask = np.expand_dims(attn_mask, axis=1)
            attn_mask = np.expand_dims(attn_mask, axis=0)
            attn_mask = Tensor(np.where(attn_mask == 0, 0.0, -100.0), dtype=mstype.float32)
            self.attn_mask = Parameter(attn_mask, requires_grad=False)
            self.roll_pos = Roll(self.shift_size)
            self.roll_neg = Roll(-self.shift_size)
        else:
            self.attn_mask = None

        self.window_partition = WindowPartition(self.window_size)
        self.window_reverse = WindowReverse()

    def construct(self, x: Tensor) -> Tensor:
        h, w = self.input_resolution
        b, _, c = x.shape

        shortcut = x
        x = self.norm1(x)
        x = ops.reshape(x, (b, h, w, c,))

        # cyclic shift
        if self.shift_size > 0:
            shifted_x = self.roll_neg(x)
            # shifted_x = numpy.roll(x, (-self.shift_size, -self.shift_size), (1, 2))
        else:
            shifted_x = x

        # partition windows
        x_windows = self.window_partition(shifted_x)  # nW*B, window_size, window_size, C
        x_windows = ops.reshape(x_windows,
                                (-1, self.window_size * self.window_size, c,))  # nW*B, window_size*window_size, C

        # W-MSA/SW-MSA
        attn_windows = self.attn(x_windows, mask=self.attn_mask)  # nW*B, window_size*window_size, C

        # merge windows
        attn_windows = ops.reshape(attn_windows, (-1, self.window_size, self.window_size, c,))
        shifted_x = self.window_reverse(attn_windows, self.window_size, h, w)  # B H' W' C

        # reverse cyclic shift
        if self.shift_size > 0:
            x = self.roll_pos(shifted_x)
        else:
            x = shifted_x

        x = ops.reshape(x, (b, h * w, c,))

        # FFN
        x = shortcut + self.drop_path(x)

        x = x + self.drop_path(self.mlp(self.norm2(x)))

        return x

    def extra_repr(self) -> str:
        return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \
               f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}"

mindcv.models.swintransformer.WindowAttention

Bases: Cell

Window based multi-head self attention (W-MSA) Cell with relative position bias. It supports both shifted and non-shifted windows.

PARAMETER DESCRIPTION
dim

Number of input channels.

TYPE: int

window_size

The height and width of the window.

TYPE: tuple[int]

num_heads

Number of attention heads.

TYPE: int

qkv_bias

If True, add a learnable bias to query, key, value. Default: True

TYPE: bool DEFAULT: True

qk_scale

Override default qk scale of head_dim ** -0.5 if set

TYPE: float | None

attn_drop

Dropout ratio of attention weight. Default: 0.0

TYPE: float DEFAULT: 0.0

proj_drop

Dropout ratio of output. Default: 0.0

TYPE: float DEFAULT: 0.0

Source code in mindcv\models\swintransformer.py
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
class WindowAttention(nn.Cell):
    r"""Window based multi-head self attention (W-MSA) Cell with relative position bias.
    It supports both of shifted and non-shifted window.

    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
    """

    def __init__(
        self,
        dim: int,
        window_size: Tuple[int],
        num_heads: int,
        qkv_bias: bool = True,
        qk_scale: Optional[float] = None,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
    ) -> None:
        super().__init__()
        # Callers may pass dim as a 1-element tuple (LayerNorm shape); unwrap it.
        if isinstance(dim, tuple) and len(dim) == 1:
            dim = dim[0]
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = Tensor(qk_scale or head_dim**-0.5, mstype.float32)
        # Learnable relative position bias, indexed per token pair inside the window.
        self.relative_bias = RelativeBias(self.window_size, num_heads)

        # separate q/k/v projections (instead of one fused qkv Dense)
        self.q = nn.Dense(in_channels=dim, out_channels=dim, has_bias=qkv_bias)
        self.k = nn.Dense(in_channels=dim, out_channels=dim, has_bias=qkv_bias)
        self.v = nn.Dense(in_channels=dim, out_channels=dim, has_bias=qkv_bias)

        self.attn_drop = Dropout(p=attn_drop)
        self.proj = nn.Dense(in_channels=dim, out_channels=dim, has_bias=True)
        self.proj_drop = Dropout(p=proj_drop)
        self.softmax = nn.Softmax(axis=-1)
        self.batch_matmul = ops.BatchMatMul()

    def construct(self, x: Tensor, mask: Optional[Tensor] = None) -> Tensor:
        """
        Args:
            x: input features with shape of (num_windows*B, N, C)
            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
        """
        b_, n, c = x.shape
        # Project to multi-head q/k/v; q is pre-scaled so qk^T needs no extra scaling.
        q = ops.reshape(self.q(x), (b_, n, self.num_heads, c // self.num_heads)) * self.scale
        q = ops.transpose(q, (0, 2, 1, 3))
        k = ops.reshape(self.k(x), (b_, n, self.num_heads, c // self.num_heads))
        k = ops.transpose(k, (0, 2, 3, 1))
        v = ops.reshape(self.v(x), (b_, n, self.num_heads, c // self.num_heads))
        v = ops.transpose(v, (0, 2, 1, 3))

        attn = self.batch_matmul(q, k)
        attn = attn + self.relative_bias()

        if mask is not None:
            # Broadcast the per-window shift mask across batch and heads.
            nw = mask.shape[1]
            attn = ops.reshape(attn, (b_ // nw, nw, self.num_heads, n, n,)) + mask
            attn = ops.reshape(attn, (-1, self.num_heads, n, n,))
            attn = self.softmax(attn)
        else:
            attn = self.softmax(attn)
        attn = self.attn_drop(attn)
        x = ops.reshape(ops.transpose(self.batch_matmul(attn, v), (0, 2, 1, 3)), (b_, n, c))
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

    def extra_repr(self) -> str:
        return f"dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}"
mindcv.models.swintransformer.WindowAttention.construct(x, mask=None)
PARAMETER DESCRIPTION
x

input features with shape of (num_windows*B, N, C)

TYPE: Tensor

mask

(0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None

TYPE: Optional[Tensor] DEFAULT: None

Source code in mindcv\models\swintransformer.py
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
def construct(self, x: Tensor, mask: Optional[Tensor] = None) -> Tensor:
    """
    Args:
        x: input features with shape of (num_windows*B, N, C)
        mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
    """
    b_, n, c = x.shape
    # Project to multi-head q/k/v; q is pre-scaled so qk^T needs no extra scaling.
    q = ops.reshape(self.q(x), (b_, n, self.num_heads, c // self.num_heads)) * self.scale
    q = ops.transpose(q, (0, 2, 1, 3))
    k = ops.reshape(self.k(x), (b_, n, self.num_heads, c // self.num_heads))
    k = ops.transpose(k, (0, 2, 3, 1))
    v = ops.reshape(self.v(x), (b_, n, self.num_heads, c // self.num_heads))
    v = ops.transpose(v, (0, 2, 1, 3))

    attn = self.batch_matmul(q, k)
    attn = attn + self.relative_bias()

    if mask is not None:
        # Broadcast the per-window shift mask across batch and heads.
        nw = mask.shape[1]
        attn = ops.reshape(attn, (b_ // nw, nw, self.num_heads, n, n,)) + mask
        attn = ops.reshape(attn, (-1, self.num_heads, n, n,))
        attn = self.softmax(attn)
    else:
        attn = self.softmax(attn)
    attn = self.attn_drop(attn)
    x = ops.reshape(ops.transpose(self.batch_matmul(attn, v), (0, 2, 1, 3)), (b_, n, c))
    x = self.proj(x)
    x = self.proj_drop(x)
    return x

mindcv.models.swintransformer.WindowPartition

Bases: Cell

Source code in mindcv\models\swintransformer.py
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
class WindowPartition(nn.Cell):
    """Split a (b, h, w, c) feature map into non-overlapping square windows."""

    def __init__(
        self,
        window_size: int,
    ) -> None:
        super(WindowPartition, self).__init__()

        self.window_size = window_size

    def construct(self, x: Tensor) -> Tensor:
        """
        Args:
            x: (b, h, w, c)

        Returns:
            windows: Tensor(num_windows*b, window_size, window_size, c)
        """
        b, h, w, c = x.shape
        ws = self.window_size
        # Carve h and w into (grid, window) pairs, then bring the two
        # in-window axes together before flattening batch and grid.
        x = ops.reshape(x, (b, h // ws, ws, w // ws, ws, c))
        x = ops.transpose(x, (0, 1, 3, 2, 4, 5))
        return ops.reshape(x, (b * h * w // (ws * ws), ws, ws, c))
mindcv.models.swintransformer.WindowPartition.construct(x)
PARAMETER DESCRIPTION
x

(b, h, w, c)

TYPE: Tensor

window_size

window size

TYPE: int

RETURNS DESCRIPTION
windows

Tensor(num_windows*b, window_size, window_size, c)

TYPE: Tensor

Source code in mindcv\models\swintransformer.py
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
def construct(self, x: Tensor) -> Tensor:
    """
    Partition a feature map into non-overlapping windows.

    Args:
        x: (b, h, w, c)

    Returns:
        windows: Tensor(num_windows*b, window_size, window_size, c)
    """
    batch, height, width, channels = x.shape
    size = self.window_size
    n_windows = batch * height * width // (size**2)
    grid = ops.reshape(x, (batch, height // size, size, width // size, size, channels))
    grid = ops.transpose(grid, (0, 1, 3, 2, 4, 5))
    return ops.reshape(grid, (n_windows, size, size, channels))

mindcv.models.swintransformer.WindowReverse

Bases: Cell

Source code in mindcv\models\swintransformer.py
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
class WindowReverse(nn.Cell):
    """Merge per-window tensors back into a full (B, H, W, C) feature map."""

    def construct(
        self,
        windows: Tensor,
        window_size: int,
        h: int,
        w: int,
    ) -> Tensor:
        """
        Args:
            windows: (num_windows*B, window_size, window_size, C)
            window_size (int): Window size
            h (int): Height of image
            w (int): Width of image

        Returns:
            x: (B, H, W, C)
        """
        # Recover the batch size from the total number of windows.
        num_windows = h * w // window_size // window_size
        batch = windows.shape[0] // num_windows
        out = ops.reshape(windows, (batch, h // window_size, w // window_size, window_size, window_size, -1))
        # Undo the (row, col, in-window row, in-window col) interleaving.
        out = ops.transpose(out, (0, 1, 3, 2, 4, 5))
        return ops.reshape(out, (batch, h, w, -1))
mindcv.models.swintransformer.WindowReverse.construct(windows, window_size, h, w)
PARAMETER DESCRIPTION
windows

(num_windows*B, window_size, window_size, C)

TYPE: Tensor

window_size

Window size

TYPE: int

h

Height of image

TYPE: int

w

Width of image

TYPE: int

RETURNS DESCRIPTION
x

(B, H, W, C)

TYPE: Tensor

Source code in mindcv\models\swintransformer.py
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
def construct(
    self,
    windows: Tensor,
    window_size: int,
    h: int,
    w: int,
) -> Tensor:
    """
    Inverse of window partitioning: stitch windows back into one image.

    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        h (int): Height of image
        w (int): Width of image

    Returns:
        x: (B, H, W, C)
    """
    ws = window_size
    b = windows.shape[0] // (h * w // ws // ws)
    x = ops.reshape(windows, (b, h // ws, w // ws, ws, ws, -1))
    # Swap the in-window row axis next to its grid-row axis, then flatten.
    x = ops.transpose(x, (0, 1, 3, 2, 4, 5))
    x = ops.reshape(x, (b, h, w, -1))
    return x

mindcv.models.swintransformer.swin_tiny(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get SwinTransformer tiny model. Refer to the base class 'models.SwinTransformer' for more details.

Source code in mindcv\models\swintransformer.py
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
@register_model
def swin_tiny(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> SwinTransformer:
    """Get SwinTransformer tiny model.
    Refer to the base class 'models.SwinTransformer' for more details.
    """
    model = SwinTransformer(
        image_size=224,
        patch_size=4,
        in_chans=in_channels,
        num_classes=num_classes,
        embed_dim=96,
        depths=[2, 2, 6, 2],
        num_heads=[3, 6, 12, 24],
        window_size=7,
        mlp_ratio=4.0,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.2,
        norm_layer=nn.LayerNorm,
        ape=False,
        patch_norm=True,
        **kwargs,
    )

    if pretrained:
        # Download and load the released checkpoint for this configuration.
        load_pretrained(model, default_cfgs["swin_tiny"], num_classes=num_classes, in_channels=in_channels)

    return model

mindcv.models.swintransformer.window_partition(x, window_size)

PARAMETER DESCRIPTION
x

(B, H, W, C)

window_size

window size

TYPE: int

RETURNS DESCRIPTION
windows

numpy(num_windows*B, window_size, window_size, C)

Source code in mindcv\models\swintransformer.py
65
66
67
68
69
70
71
72
73
74
75
76
77
def window_partition(x, window_size: int):
    """
    Args:
        x: (B, H, W, C)
        window_size (int): window size

    Returns:
        windows: numpy(num_windows*B, window_size, window_size, C)
    """
    b, h, w, c = x.shape
    # Split each spatial axis into (grid, window) pairs, pair up the two
    # in-window axes, then collapse batch and grid into one leading axis.
    tiles = x.reshape(b, h // window_size, window_size, w // window_size, window_size, c)
    tiles = tiles.transpose(0, 1, 3, 2, 4, 5)
    return tiles.reshape(-1, window_size, window_size, c)

swintransformerv2

mindcv.models.swintransformerv2

MindSpore implementation of SwinTransformer V2. Refer to Swin Transformer V2: Scaling Up Capacity and Resolution.

mindcv.models.swintransformerv2.SwinTransformerV2

Bases: Cell

SwinTransformerV2 model class, based on "Swin Transformer V2: Scaling Up Capacity and Resolution" <https://arxiv.org/abs/2111.09883>_

PARAMETER DESCRIPTION
image_size

Input image size. Default: 256.

TYPE: int DEFAULT: 256

patch_size

Patch size. Default: 4.

TYPE: int DEFAULT: 4

in_channels

Number the channels of the input. Default: 3.

TYPE: int DEFAULT: 3

num_classes

Number of classification classes. Default: 1000.

TYPE: int DEFAULT: 1000

embed_dim

Patch embedding dimension. Default: 96.

TYPE: int DEFAULT: 96

depths

Depth of each Swin Transformer layer. Default: [2, 2, 6, 2].

TYPE: List[int] DEFAULT: [2, 2, 6, 2]

num_heads

Number of attention heads in different layers. Default: [3, 6, 12, 24].

TYPE: List[int] DEFAULT: [3, 6, 12, 24]

window_size

Window size. Default: 7.

TYPE: int DEFAULT: 7

mlp_ratio

Ratio of mlp hidden dim to embedding dim. Default: 4.

TYPE: float DEFAULT: 4.0

qkv_bias

If True, add a bias for query, key, value. Default: True.

TYPE: bool DEFAULT: True

drop_rate

Drop probability for the Dropout layer. Default: 0.

TYPE: float DEFAULT: 0.0

attn_drop_rate

Attention drop probability for the Dropout layer. Default: 0.

TYPE: float DEFAULT: 0.0

drop_path_rate

Stochastic depth rate. Default: 0.1.

TYPE: float DEFAULT: 0.1

norm_layer

Normalization layer. Default: nn.LayerNorm.

TYPE: Cell DEFAULT: LayerNorm

patch_norm

If True, add normalization after patch embedding. Default: True.

TYPE: bool DEFAULT: True

pretrained_window_sizes

Pretrained window sizes of each layer. Default: [0, 0, 0, 0].

TYPE: List[int] DEFAULT: [0, 0, 0, 0]

Source code in mindcv\models\swintransformerv2.py
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
class SwinTransformerV2(nn.Cell):
    r"""SwinTransformerV2 model class, based on
    `"Swin Transformer V2: Scaling Up Capacity and Resolution" <https://arxiv.org/abs/2111.09883>`_

    Args:
        image_size: Input image size. Default: 256.
        patch_size: Patch size. Default: 4.
        in_channels: Number the channels of the input. Default: 3.
        num_classes: Number of classification classes. Default: 1000.
        embed_dim: Patch embedding dimension. Default: 96.
        depths: Depth of each Swin Transformer layer. Default: [2, 2, 6, 2].
        num_heads: Number of attention heads in different layers. Default: [3, 6, 12, 24].
        window_size: Window size. Default: 7.
        mlp_ratio: Ratio of mlp hidden dim to embedding dim. Default: 4.
        qkv_bias: If True, add a bias for query, key, value. Default: True.
        drop_rate: Drop probability for the Dropout layer. Default: 0.
        attn_drop_rate: Attention drop probability for the Dropout layer. Default: 0.
        drop_path_rate: Stochastic depth rate. Default: 0.1.
        norm_layer: Normalization layer. Default: nn.LayerNorm.
        patch_norm: If True, add normalization after patch embedding. Default: True.
        pretrained_window_sizes: Pretrained window sizes of each layer. Default: [0, 0, 0, 0].
    """

    # NOTE(review): the list defaults below are mutable default arguments, but they
    # are only read (indexed/summed), never mutated, so the shared-default pitfall
    # does not bite here; tuples would still be safer.
    def __init__(
        self,
        image_size: int = 256,
        patch_size: int = 4,
        in_channels: int = 3,
        num_classes: int = 1000,
        embed_dim: int = 96,
        depths: List[int] = [2, 2, 6, 2],
        num_heads: List[int] = [3, 6, 12, 24],
        window_size: int = 7,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = True,
        drop_rate: float = 0.0,
        attn_drop_rate: float = 0.0,
        drop_path_rate: float = 0.1,
        norm_layer: nn.Cell = nn.LayerNorm,
        patch_norm: bool = True,
        pretrained_window_sizes: List[int] = [0, 0, 0, 0],
    ) -> None:
        super().__init__()
        self.num_classes = num_classes
        self.num_layers = len(depths)
        self.embed_dim = embed_dim
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.patch_norm = patch_norm
        # Channel width of the final stage: embed_dim doubles at every downsample.
        self.num_features = int(embed_dim * 2 ** (self.num_layers - 1))
        self.mlp_ratio = mlp_ratio

        # split image into non-overlapping patches
        self.patch_embed = PatchEmbed(
            image_size=image_size, patch_size=patch_size, in_chans=in_channels, embed_dim=embed_dim,
            norm_layer=norm_layer if self.patch_norm else None)
        num_patches = self.patch_embed.num_patches
        self.num_patches = num_patches
        patches_resolution = self.patch_embed.patches_resolution
        self.patches_resolution = patches_resolution

        self.pos_drop = Dropout(p=drop_rate)

        # stochastic depth: per-block drop-path probability grows linearly
        # from 0 to drop_path_rate across all blocks of all stages
        dpr = [x for x in np.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule

        # build layers
        self.layers = nn.CellList()
        self.final_seq = num_patches  # downsample seq_length
        for i_layer in range(self.num_layers):
            layer = BasicLayer(
                dim=int(embed_dim * 2**i_layer),
                input_resolution=(patches_resolution[0] // (2**i_layer),
                                  patches_resolution[1] // (2**i_layer)),
                depth=depths[i_layer],
                num_heads=num_heads[i_layer],
                window_size=window_size,
                mlp_ratio=self.mlp_ratio,
                qkv_bias=qkv_bias,
                drop=drop_rate, attn_drop=attn_drop_rate,
                # each stage consumes its own contiguous slice of the schedule
                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
                norm_layer=norm_layer,
                downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
                pretrained_window_size=pretrained_window_sizes[i_layer]
            )
            # downsample seq_length
            if i_layer < self.num_layers - 1:
                self.final_seq = self.final_seq // 4
            self.layers.append(layer)
        self.head = nn.Dense(self.num_features, self.num_classes)

        self.norm = norm_layer([self.num_features, ], epsilon=1e-6)
        # mean over the sequence axis; keep_dims=False drops the reduced axis
        self.avgpool = ops.ReduceMean(keep_dims=False)

        self._initialize_weights()

    def _initialize_weights(self):
        """Initialize conv, layer-norm and dense weights in place."""
        for _, cell in self.cells_and_names():
            if isinstance(cell, nn.Conv2d):
                cell.weight.set_data(init.initializer(init.HeUniform(), cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(init.initializer("zeros", cell.bias.shape, cell.bias.dtype))
            elif isinstance(cell, nn.LayerNorm):
                cell.gamma.set_data(init.initializer("ones", cell.gamma.shape, cell.gamma.dtype))
                cell.beta.set_data(init.initializer("zeros", cell.beta.shape, cell.beta.dtype))
            elif isinstance(cell, nn.Dense):
                cell.weight.set_data(
                    init.initializer(init.TruncatedNormal(sigma=0.02), cell.weight.shape, cell.weight.dtype)
                )
                if cell.bias is not None:
                    cell.bias.set_data(init.initializer("zeros", cell.bias.shape, cell.bias.dtype))

    def forward_features(self, x: Tensor) -> Tensor:
        """Embed patches, run all transformer stages and pool over the sequence."""
        x = self.patch_embed(x)
        x = self.pos_drop(x)
        for layer in self.layers:
            x = layer(x)
        x = self.norm(x)  # B L C
        x = self.avgpool(ops.transpose(x, (0, 2, 1)), 2)  # (B, C) after reducing the sequence axis
        return x

    def forward_head(self, x: Tensor) -> Tensor:
        """Map pooled features to class logits."""
        x = self.head(x)
        return x

    def construct(self, x: Tensor) -> Tensor:
        x = self.forward_features(x)
        x = self.forward_head(x)
        return x

vgg

mindcv.models.vgg

MindSpore implementation of VGGNet. Refer to Very Deep Convolutional Networks for Large-Scale Image Recognition.

mindcv.models.vgg.VGG

Bases: Cell

VGGNet model class, based on "Very Deep Convolutional Networks for Large-Scale Image Recognition" <https://arxiv.org/abs/1409.1556>_

PARAMETER DESCRIPTION
model_name

name of the architecture. 'vgg11', 'vgg13', 'vgg16' or 'vgg19'.

TYPE: str

batch_norm

use batch normalization or not. Default: False.

TYPE: bool DEFAULT: False

num_classes

number of classification classes. Default: 1000.

TYPE: int DEFAULT: 1000

in_channels

number the channels of the input. Default: 3.

TYPE: int DEFAULT: 3

drop_rate

dropout rate of the classifier. Default: 0.5.

TYPE: float DEFAULT: 0.5

Source code in mindcv\models\vgg.py
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
class VGG(nn.Cell):
    r"""VGGNet model class, based on
    `"Very Deep Convolutional Networks for Large-Scale Image Recognition" <https://arxiv.org/abs/1409.1556>`_

    Args:
        model_name: name of the architecture. 'vgg11', 'vgg13', 'vgg16' or 'vgg19'.
        batch_norm: use batch normalization or not. Default: False.
        num_classes: number of classification classes. Default: 1000.
        in_channels: number of the channels of the input. Default: 3.
        drop_rate: dropout rate of the classifier. Default: 0.5.
    """

    def __init__(
        self,
        model_name: str,
        batch_norm: bool = False,
        num_classes: int = 1000,
        in_channels: int = 3,
        drop_rate: float = 0.5,
    ) -> None:
        super().__init__()
        # Convolutional backbone built from the per-architecture config table.
        self.features = _make_layers(cfgs[model_name], batch_norm=batch_norm, in_channels=in_channels)
        self.flatten = nn.Flatten()
        # Classic VGG head: two 4096-wide FC layers, then the logits layer.
        head = [
            nn.Dense(512 * 7 * 7, 4096),
            nn.ReLU(),
            Dropout(p=drop_rate),
            nn.Dense(4096, 4096),
            nn.ReLU(),
            Dropout(p=drop_rate),
            nn.Dense(4096, num_classes),
        ]
        self.classifier = nn.SequentialCell(head)
        self._initialize_weights()

    def _initialize_weights(self) -> None:
        """Initialize weights for cells."""
        for _, cell in self.cells_and_names():
            if isinstance(cell, nn.Conv2d):
                conv_init = init.HeNormal(math.sqrt(5), mode="fan_out", nonlinearity="relu")
                cell.weight.set_data(init.initializer(conv_init, cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(init.initializer("zeros", cell.bias.shape, cell.bias.dtype))
            elif isinstance(cell, nn.Dense):
                cell.weight.set_data(init.initializer(init.Normal(0.01), cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(init.initializer("zeros", cell.bias.shape, cell.bias.dtype))

    def forward_features(self, x: Tensor) -> Tensor:
        """Run the convolutional backbone."""
        return self.features(x)

    def forward_head(self, x: Tensor) -> Tensor:
        """Flatten features and run the classifier head."""
        return self.classifier(self.flatten(x))

    def construct(self, x: Tensor) -> Tensor:
        return self.forward_head(self.forward_features(x))

mindcv.models.vgg.vgg11(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get 11 layers VGG model. Refer to the base class models.VGG for more details.

Source code in mindcv\models\vgg.py
138
139
140
141
142
143
144
145
146
147
148
149
@register_model
def vgg11(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> VGG:
    """Get 11 layers VGG model.
    Refer to the base class `models.VGG` for more details.
    """
    model = VGG(model_name="vgg11", num_classes=num_classes, in_channels=in_channels, **kwargs)
    if pretrained:
        # Load released ImageNet weights for this configuration.
        load_pretrained(model, default_cfgs["vgg11"], num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.vgg.vgg13(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get 13 layers VGG model. Refer to the base class models.VGG for more details.

Source code in mindcv\models\vgg.py
152
153
154
155
156
157
158
159
160
161
162
163
@register_model
def vgg13(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> VGG:
    """Get 13 layers VGG model.
    Refer to the base class `models.VGG` for more details.
    """
    model = VGG(model_name="vgg13", num_classes=num_classes, in_channels=in_channels, **kwargs)
    if pretrained:
        # Load released ImageNet weights for this configuration.
        load_pretrained(model, default_cfgs["vgg13"], num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.vgg.vgg16(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get 16 layers VGG model. Refer to the base class models.VGG for more details.

Source code in mindcv\models\vgg.py
166
167
168
169
170
171
172
173
174
175
176
177
@register_model
def vgg16(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> VGG:
    """Get 16 layers VGG model.
    Refer to the base class `models.VGG` for more details.
    """
    model = VGG(model_name="vgg16", num_classes=num_classes, in_channels=in_channels, **kwargs)
    if pretrained:
        # Load released ImageNet weights for this configuration.
        load_pretrained(model, default_cfgs["vgg16"], num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.vgg.vgg19(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get 19 layers VGG model. Refer to the base class models.VGG for more details.

Source code in mindcv\models\vgg.py
180
181
182
183
184
185
186
187
188
189
190
191
@register_model
def vgg19(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> VGG:
    """Get 19 layers VGG model.
    Refer to the base class `models.VGG` for more details.
    """
    model = VGG(model_name="vgg19", num_classes=num_classes, in_channels=in_channels, **kwargs)
    if pretrained:
        # Load released ImageNet weights for this configuration.
        load_pretrained(model, default_cfgs["vgg19"], num_classes=num_classes, in_channels=in_channels)
    return model

visformer

mindcv.models.visformer

MindSpore implementation of Visformer. Refer to: Visformer: The Vision-friendly Transformer

mindcv.models.visformer.Attention

Bases: Cell

Attention layer

Source code in mindcv\models\visformer.py
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
class Attention(nn.Cell):
    """Attention layer operating on (B, C, H, W) feature maps via 1x1 convolutions."""

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        head_dim_ratio: float = 1.0,
        qkv_bias: bool = False,
        qk_scale: float = None,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
    ) -> None:
        super(Attention, self).__init__()
        self.dim = dim
        self.num_heads = num_heads
        # per-head width, optionally widened/narrowed by head_dim_ratio
        head_dim = round(dim // num_heads * head_dim_ratio)
        self.head_dim = head_dim

        # NOTE(review): qk_scale is used as an *exponent* of head_dim here, not as
        # the scale value itself. The default -0.25 is applied to both q and k in
        # construct, so the product carries the usual head_dim ** -0.5 scaling —
        # confirm callers pass an exponent rather than a literal scale.
        qk_scale_factor = qk_scale if qk_scale is not None else -0.25
        self.scale = head_dim**qk_scale_factor

        # fused q/k/v projection as a single 1x1 conv
        self.qkv = nn.Conv2d(dim, head_dim * num_heads * 3, 1, 1, pad_mode="pad", padding=0, has_bias=qkv_bias)
        self.attn_drop = Dropout(p=attn_drop)
        self.proj = nn.Conv2d(self.head_dim * self.num_heads, dim, 1, 1, pad_mode="pad", padding=0)
        self.proj_drop = Dropout(p=proj_drop)

    def construct(self, x: Tensor) -> Tensor:
        """Apply multi-head self-attention over the H*W spatial positions of x."""
        B, C, H, W = x.shape
        x = self.qkv(x)
        # (B, 3, heads, head_dim, H*W) -> (3, B, heads, H*W, head_dim)
        qkv = ops.reshape(x, (B, 3, self.num_heads, self.head_dim, H * W))
        qkv = qkv.transpose((1, 0, 2, 4, 3))
        q, k, v = qkv[0], qkv[1], qkv[2]
        # scale multiplies both q and k (see __init__ note on qk_scale semantics)
        attn = ops.matmul(q * self.scale, k.transpose(0, 1, 3, 2) * self.scale)
        attn = ops.Softmax(axis=-1)(attn)
        attn = self.attn_drop(attn)
        x = ops.matmul(attn, v)

        # merge heads back into the channel axis and restore the spatial layout
        x = x.transpose((0, 1, 3, 2)).reshape((B, -1, H, W))
        x = self.proj(x)
        x = self.proj_drop(x)

        return x

mindcv.models.visformer.Block

Bases: Cell

visformer basic block

Source code in mindcv\models\visformer.py
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
class Block(nn.Cell):
    """Visformer basic block: an optional attention branch and an MLP branch,
    each preceded by BatchNorm and wrapped in a residual with stochastic depth."""

    def __init__(
        self,
        dim: int,
        num_heads: int,
        head_dim_ratio: float = 1.0,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = False,
        qk_scale: float = None,
        drop: float = 0.0,
        attn_drop: float = 0.0,
        drop_path: float = 0.0,
        act_layer: nn.Cell = nn.GELU,
        group: int = 8,
        attn_disabled: bool = False,
        spatial_conv: bool = False,
    ) -> None:
        super(Block, self).__init__()
        self.attn_disabled = attn_disabled
        self.spatial_conv = spatial_conv
        # Identity keeps the residual path cheap when no drop-path is requested.
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity()
        if not attn_disabled:
            self.norm1 = nn.BatchNorm2d(dim)
            self.attn = Attention(
                dim,
                num_heads=num_heads,
                head_dim_ratio=head_dim_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                attn_drop=attn_drop,
                proj_drop=drop,
            )

        self.norm2 = nn.BatchNorm2d(dim)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            act_layer=act_layer,
            drop=drop,
            group=group,
            spatial_conv=spatial_conv,
        )

    def construct(self, x: Tensor) -> Tensor:
        if not self.attn_disabled:
            x = x + self.drop_path(self.attn(self.norm1(x)))
        return x + self.drop_path(self.mlp(self.norm2(x)))

mindcv.models.visformer.Mlp

Bases: Cell

MLP layer

Source code in mindcv\models\visformer.py
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
class Mlp(nn.Cell):
    """Convolutional MLP: 1x1 conv -> act -> (optional grouped 3x3 conv -> act)
    -> 1x1 conv, with dropout after the first activation and the last conv."""

    def __init__(
        self,
        in_features: int,
        hidden_features: int = None,
        out_features: int = None,
        act_layer: nn.Cell = nn.GELU,
        drop: float = 0.0,
        group: int = 8,
        spatial_conv: bool = False,
    ) -> None:
        super(Mlp, self).__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.in_features = in_features
        self.out_features = out_features
        self.spatial_conv = spatial_conv
        if self.spatial_conv:
            # The spatial variant overrides the requested hidden width.
            hidden_features = in_features * 5 // 6 if group < 2 else in_features * 2
        self.hidden_features = hidden_features
        self.group = group
        self.drop = Dropout(p=drop)
        self.conv1 = nn.Conv2d(in_features, hidden_features, 1, 1, pad_mode="pad", padding=0)
        self.act1 = act_layer()
        if self.spatial_conv:
            self.conv2 = nn.Conv2d(hidden_features, hidden_features, 3, 1, pad_mode="pad", padding=1, group=self.group)
            self.act2 = act_layer()
        self.conv3 = nn.Conv2d(hidden_features, out_features, 1, 1, pad_mode="pad", padding=0)

    def construct(self, x: Tensor) -> Tensor:
        out = self.drop(self.act1(self.conv1(x)))

        if self.spatial_conv:
            out = self.act2(self.conv2(out))

        return self.drop(self.conv3(out))

mindcv.models.visformer.Visformer

Bases: Cell

Visformer model class, based on '"Visformer: The Vision-friendly Transformer" https://arxiv.org/pdf/2104.12533.pdf'

PARAMETER DESCRIPTION
image_size

images input size. Default: 224.

TYPE: int)

init_channels

number of the channels of the input. Default: 32.

TYPE: int

num_classes

number of classification classes. Default: 1000.

TYPE: int) DEFAULT: 1000

embed_dim

embedding dimension in all head. Default: 384.

TYPE: int) DEFAULT: 384

depth

model block depth. Default: None.

TYPE: int) DEFAULT: None

num_heads

number of heads. Default: None.

TYPE: int) DEFAULT: None

mlp_ratio

ratio of hidden features in Mlp. Default: 4.

TYPE: float) DEFAULT: 4.0

qkv_bias

have bias in qkv layers or not. Default: False.

TYPE: bool) DEFAULT: False

qk_scale

Override default qk scale of head_dim ** -0.5 if set.

TYPE: float) DEFAULT: None

drop_rate

dropout rate. Default: 0.

TYPE: float) DEFAULT: 0.0

attn_drop_rate

attention layers dropout rate. Default: 0.

TYPE: float) DEFAULT: 0.0

drop_path_rate

drop path rate. Default: 0.1.

TYPE: float) DEFAULT: 0.1

attn_stage

block will have a attention layer if value = '1' else not. Default: '1111'.

TYPE: str) DEFAULT: '1111'

pos_embed

position embedding. Default: True.

TYPE: bool) DEFAULT: True

spatial_conv

block will have a spatial convolution layer if value = '1' else not. Default: '1111'.

TYPE: str) DEFAULT: '1111'

group

convolution group. Default: 8.

TYPE: int) DEFAULT: 8

pool

if true will use global_pooling else not. Default: True.

TYPE: bool) DEFAULT: True

conv_init

if true will init convolution weights else not. Default: False.

DEFAULT: False

Source code in mindcv\models\visformer.py
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
class Visformer(nn.Cell):
    r"""Visformer model class, based on
    '"Visformer: The Vision-friendly Transformer"
    <https://arxiv.org/pdf/2104.12533.pdf>'

    Args:
        img_size (int) : input image size. Default: 224.
        init_channels (int) : number of channels produced by the stem convolution. Default: 32.
        num_classes (int) : number of classification classes. Default: 1000.
        embed_dim (int) : embedding dimension in all head. Default: 384.
        depth (List[int]) : number of blocks in each of the four stages; must be a
            list/tuple of length 4 (asserted in ``__init__``). Default: None.
        num_heads (List[int]) : number of heads per stage; a scalar is broadcast
            to all four stages. Default: None.
        mlp_ratio (float) : ratio of hidden features in Mlp. Default: 4.
        qkv_bias (bool) : have bias in qkv layers or not. Default: False.
        qk_scale (float) : Override default qk scale of head_dim ** -0.5 if set.
        drop_rate (float) : dropout rate. Default: 0.
        attn_drop_rate (float) : attention layers dropout rate. Default: 0.
        drop_path_rate (float) : drop path rate. Default: 0.1.
        attn_stage (str) : block will have an attention layer if value = '1' else not. Default: '1111'.
        pos_embed (bool) : position embedding. Default: True.
        spatial_conv (str) : block will have a spatial convolution layer if value = '1' else not. Default: '1111'.
        group (int) : convolution group. Default: 8.
        pool (bool) : if true will use global_pooling else not. Default: True.
        conv_init (bool) : if true, init convolution weights with HeNormal instead of
            TruncatedNormal. Default: False.
    """

    def __init__(
        self,
        img_size: int = 224,
        init_channels: int = 32,
        num_classes: int = 1000,
        embed_dim: int = 384,
        depth: List[int] = None,
        num_heads: List[int] = None,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = False,
        qk_scale: float = None,
        drop_rate: float = 0.0,
        attn_drop_rate: float = 0.0,
        drop_path_rate: float = 0.1,
        attn_stage: str = "1111",
        pos_embed: bool = True,
        spatial_conv: str = "1111",
        group: int = 8,
        pool: bool = True,
        conv_init: bool = False,
    ) -> None:
        super(Visformer, self).__init__()
        self.num_classes = num_classes
        self.num_features = self.embed_dim = embed_dim
        self.init_channels = init_channels
        self.img_size = img_size
        self.pool = pool
        self.conv_init = conv_init
        self.depth = depth
        # depth is required and must describe exactly the four stages below
        assert (isinstance(depth, list) or isinstance(depth, tuple)) and len(depth) == 4
        if not (isinstance(num_heads, list) or isinstance(num_heads, tuple)):
            num_heads = [num_heads] * 4

        self.pos_embed = pos_embed
        # stochastic-depth rate grows linearly over all blocks of all stages
        dpr = np.linspace(0, drop_path_rate, sum(depth)).tolist()

        # 7x7/stride-2 stem: halves the spatial resolution
        self.stem = nn.SequentialCell([
            nn.Conv2d(3, self.init_channels, 7, 2, pad_mode="pad", padding=3),
            nn.BatchNorm2d(self.init_channels),
            nn.ReLU()
        ])
        img_size //= 2

        self.pos_drop = Dropout(p=drop_rate)
        # stage0 (optional, only when depth[0] > 0): runs at embed_dim // 4 channels
        if depth[0]:
            self.patch_embed0 = PatchEmbed(img_size=img_size, patch_size=2, in_chans=self.init_channels,
                                           embed_dim=embed_dim // 4)
            img_size //= 2
            if self.pos_embed:
                self.pos_embed0 = mindspore.Parameter(
                    ops.zeros((1, embed_dim // 4, img_size, img_size), mindspore.float32))
            self.stage0 = nn.CellList([
                Block(dim=embed_dim // 4, num_heads=num_heads[0], head_dim_ratio=0.25, mlp_ratio=mlp_ratio,
                      qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i],
                      group=group, attn_disabled=(attn_stage[0] == "0"), spatial_conv=(spatial_conv[0] == "1"))
                for i in range(depth[0])
            ])

        # stage1: input channels depend on whether stage0 existed
        if depth[0]:
            self.patch_embed1 = PatchEmbed(img_size=img_size, patch_size=2, in_chans=embed_dim // 4,
                                           embed_dim=embed_dim // 2)
            img_size //= 2
        else:
            # no stage0: downsample straight from the stem output with a larger patch
            self.patch_embed1 = PatchEmbed(img_size=img_size, patch_size=4, in_chans=self.init_channels,
                                           embed_dim=embed_dim // 2)
            img_size //= 4

        if self.pos_embed:
            self.pos_embed1 = mindspore.Parameter(ops.zeros((1, embed_dim // 2, img_size, img_size), mindspore.float32))

        self.stage1 = nn.CellList([
            Block(
                dim=embed_dim // 2, num_heads=num_heads[1], head_dim_ratio=0.5, mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i],
                group=group, attn_disabled=(attn_stage[1] == "0"), spatial_conv=(spatial_conv[1] == "1")
            )
            for i in range(sum(depth[:1]), sum(depth[:2]))
        ])

        # stage2: full embed_dim channels
        self.patch_embed2 = PatchEmbed(img_size=img_size, patch_size=2, in_chans=embed_dim // 2, embed_dim=embed_dim)
        img_size //= 2
        if self.pos_embed:
            self.pos_embed2 = mindspore.Parameter(ops.zeros((1, embed_dim, img_size, img_size), mindspore.float32))
        self.stage2 = nn.CellList([
            Block(
                dim=embed_dim, num_heads=num_heads[2], head_dim_ratio=1.0, mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i],
                group=group, attn_disabled=(attn_stage[2] == "0"), spatial_conv=(spatial_conv[2] == "1")
            )
            for i in range(sum(depth[:2]), sum(depth[:3]))
        ])

        # stage3: channels double to embed_dim * 2
        self.patch_embed3 = PatchEmbed(img_size=img_size, patch_size=2, in_chans=embed_dim, embed_dim=embed_dim * 2)
        img_size //= 2
        if self.pos_embed:
            self.pos_embed3 = mindspore.Parameter(ops.zeros((1, embed_dim * 2, img_size, img_size), mindspore.float32))
        self.stage3 = nn.CellList([
            Block(
                dim=embed_dim * 2, num_heads=num_heads[3], head_dim_ratio=1.0, mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i],
                group=group, attn_disabled=(attn_stage[3] == "0"), spatial_conv=(spatial_conv[3] == "1")
            )
            for i in range(sum(depth[:3]), sum(depth[:4]))
        ])

        # head
        if self.pool:
            self.global_pooling = GlobalAvgPooling()

        self.norm = nn.BatchNorm2d(embed_dim * 2)
        self.head = nn.Dense(embed_dim * 2, num_classes)

        # weight init: position embeddings use truncated normal (std=0.02)
        if self.pos_embed:
            if depth[0]:
                self.pos_embed0.set_data(initializer(TruncatedNormal(0.02),
                                                     self.pos_embed0.shape, self.pos_embed0.dtype))
            self.pos_embed1.set_data(initializer(TruncatedNormal(0.02),
                                                 self.pos_embed1.shape, self.pos_embed1.dtype))
            self.pos_embed2.set_data(initializer(TruncatedNormal(0.02),
                                                 self.pos_embed2.shape, self.pos_embed2.dtype))
            self.pos_embed3.set_data(initializer(TruncatedNormal(0.02),
                                                 self.pos_embed3.shape, self.pos_embed3.dtype))
        self._initialize_weights()

    def _initialize_weights(self) -> None:
        """Initialize dense/norm/conv weights; conv init style is controlled by ``self.conv_init``."""
        for _, cell in self.cells_and_names():
            if isinstance(cell, nn.Dense):
                cell.weight.set_data(initializer(TruncatedNormal(0.02), cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(initializer(Constant(0), cell.bias.shape, cell.bias.dtype))
            elif isinstance(cell, nn.LayerNorm):
                cell.beta.set_data(initializer(Constant(0), cell.beta.shape, cell.beta.dtype))
                cell.gamma.set_data(initializer(Constant(1), cell.gamma.shape, cell.gamma.dtype))
            elif isinstance(cell, nn.BatchNorm2d):
                cell.beta.set_data(initializer(Constant(0), cell.beta.shape, cell.beta.dtype))
                cell.gamma.set_data(initializer(Constant(1), cell.gamma.shape, cell.gamma.dtype))
            elif isinstance(cell, nn.Conv2d):
                if self.conv_init:
                    cell.weight.set_data(initializer(HeNormal(mode="fan_out", nonlinearity="relu"), cell.weight.shape,
                                                     cell.weight.dtype))
                else:
                    cell.weight.set_data(initializer(TruncatedNormal(0.02), cell.weight.shape, cell.weight.dtype))
                if cell.bias is not None:
                    cell.bias.set_data(initializer(Constant(0), cell.bias.shape, cell.bias.dtype))

    def forward_features(self, x: Tensor) -> Tensor:
        """Run stem + the four stages and return the normalized feature map (NCHW)."""
        x = self.stem(x)

        # stage 0
        if self.depth[0]:
            x = self.patch_embed0(x)
            if self.pos_embed:
                x = x + self.pos_embed0
                x = self.pos_drop(x)
            for b in self.stage0:
                x = b(x)

        # stage 1
        x = self.patch_embed1(x)
        if self.pos_embed:
            x = x + self.pos_embed1
            x = self.pos_drop(x)
        for b in self.stage1:
            x = b(x)

        # stage 2
        x = self.patch_embed2(x)
        if self.pos_embed:
            x = x + self.pos_embed2
            x = self.pos_drop(x)
        for b in self.stage2:
            x = b(x)

        # stage 3
        x = self.patch_embed3(x)
        if self.pos_embed:
            x = x + self.pos_embed3
            x = self.pos_drop(x)
        for b in self.stage3:
            x = b(x)
        x = self.norm(x)
        return x

    def forward_head(self, x: Tensor) -> Tensor:
        """Pool (or take the top-left position) and classify."""
        # head
        if self.pool:
            x = self.global_pooling(x)
        else:
            # without pooling, use the feature at spatial position (0, 0)
            x = x[:, :, 0, 0]
        x = self.head(x.view(x.shape[0], -1))
        return x

    def construct(self, x: Tensor) -> Tensor:
        x = self.forward_features(x)
        x = self.forward_head(x)
        return x

mindcv.models.visformer.visformer_small(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get visformer small model. Refer to the base class 'models.visformer' for more details.

Source code in mindcv\models\visformer.py
468
469
470
471
472
473
474
475
476
477
478
479
@register_model
def visformer_small(pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs):
    """Build the Visformer-Small model.
    See the base class 'models.visformer' for architecture details.
    """
    cfg = default_cfgs["visformer_small"]
    model = Visformer(
        img_size=224,
        init_channels=32,
        num_classes=num_classes,
        embed_dim=384,
        depth=[0, 7, 4, 4],
        num_heads=[6, 6, 6, 6],
        mlp_ratio=4.0,
        group=8,
        attn_stage="0011",
        spatial_conv="1100",
        conv_init=True,
        **kwargs,
    )
    if pretrained:
        load_pretrained(model, cfg, num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.visformer.visformer_small_v2(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get visformer small2 model. Refer to the base class 'models.visformer' for more details.

Source code in mindcv\models\visformer.py
482
483
484
485
486
487
488
489
490
491
492
493
@register_model
def visformer_small_v2(pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs):
    """Build the Visformer-Small-V2 model.
    See the base class 'models.visformer' for architecture details.
    """
    cfg = default_cfgs["visformer_small_v2"]
    model = Visformer(
        img_size=224,
        init_channels=32,
        num_classes=num_classes,
        embed_dim=256,
        depth=[1, 10, 14, 3],
        num_heads=[2, 4, 8, 16],
        mlp_ratio=4.0,
        qk_scale=-0.5,
        group=8,
        attn_stage="0011",
        spatial_conv="1100",
        conv_init=True,
        **kwargs,
    )
    if pretrained:
        load_pretrained(model, cfg, num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.visformer.visformer_tiny(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get visformer tiny model. Refer to the base class 'models.visformer' for more details.

Source code in mindcv\models\visformer.py
439
440
441
442
443
444
445
446
447
448
449
450
451
@register_model
def visformer_tiny(pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs):
    """Build the Visformer-Tiny model.
    See the base class 'models.visformer' for architecture details.
    """
    cfg = default_cfgs["visformer_tiny"]
    model = Visformer(
        img_size=224,
        init_channels=16,
        num_classes=num_classes,
        embed_dim=192,
        depth=[0, 7, 4, 4],
        num_heads=[3, 3, 3, 3],
        mlp_ratio=4.0,
        group=8,
        attn_stage="0011",
        spatial_conv="1100",
        drop_path_rate=0.03,
        conv_init=True,
        **kwargs,
    )
    if pretrained:
        load_pretrained(model, cfg, num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.visformer.visformer_tiny_v2(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get visformer tiny2 model. Refer to the base class 'models.visformer' for more details.

Source code in mindcv\models\visformer.py
454
455
456
457
458
459
460
461
462
463
464
465
@register_model
def visformer_tiny_v2(pretrained: bool = False, num_classes: int = 1000, in_channels: int = 3, **kwargs):
    """Build the Visformer-Tiny-V2 model.
    See the base class 'models.visformer' for architecture details.
    """
    cfg = default_cfgs["visformer_tiny_v2"]
    model = Visformer(
        img_size=224,
        init_channels=24,
        num_classes=num_classes,
        embed_dim=192,
        depth=[1, 4, 6, 3],
        num_heads=[1, 3, 6, 12],
        mlp_ratio=4.0,
        qk_scale=-0.5,
        group=8,
        attn_stage="0011",
        spatial_conv="1100",
        drop_path_rate=0.03,
        conv_init=True,
        **kwargs,
    )
    if pretrained:
        load_pretrained(model, cfg, num_classes=num_classes, in_channels=in_channels)
    return model

vit

mindcv.models.vit

ViT

mindcv.models.vit.Attention

Bases: Cell

Attention layer implementation, Rearrange Input -> B x N x hidden size.

PARAMETER DESCRIPTION
dim

The dimension of input features.

TYPE: int

num_heads

The number of attention heads. Default: 8.

TYPE: int DEFAULT: 8

qkv_bias

Specifies whether the linear layer uses a bias vector. Default: True.

TYPE: bool DEFAULT: True

qk_norm

Specifies whether to do normalization to q and k.

TYPE: bool DEFAULT: False

attn_drop

The drop rate of attention, greater than 0 and less equal than 1. Default: 0.0.

TYPE: float DEFAULT: 0.0

proj_drop

The drop rate of output, greater than 0 and less equal than 1. Default: 0.0.

TYPE: float DEFAULT: 0.0

RETURNS DESCRIPTION

Tensor, output tensor.

Examples:

>>> ops = Attention(768, 12)
Source code in mindcv\models\vit.py
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
class Attention(nn.Cell):
    """
    Multi-head self-attention over a sequence of tokens (B x N x dim).

    Args:
        dim (int): The dimension of input features.
        num_heads (int): The number of attention heads. Default: 8.
        qkv_bias (bool): Specifies whether the linear layer uses a bias vector. Default: True.
        qk_norm (bool): Specifies whether to do normalization to q and k. Default: False.
        attn_drop (float): The drop rate of attention, greater than 0 and less equal than 1. Default: 0.0.
        proj_drop (float): The drop rate of output, greater than 0 and less equal than 1. Default: 0.0.
        norm_layer (nn.Cell): Normalization applied to q and k when qk_norm is set. Default: nn.LayerNorm.

    Returns:
        Tensor, output tensor of the same shape as the input.

    Examples:
        >>> ops = Attention(768, 12)
    """
    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = True,
        qk_norm: bool = False,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
        norm_layer: nn.Cell = nn.LayerNorm,
    ):
        super(Attention, self).__init__()
        assert dim % num_heads == 0, 'dim should be divisible by num_heads'
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = Tensor(self.head_dim ** -0.5)

        # fused q/k/v projection plus optional per-head q/k normalization
        self.qkv = nn.Dense(dim, dim * 3, has_bias=qkv_bias)
        self.q_norm = norm_layer((self.head_dim,)) if qk_norm else nn.Identity()
        self.k_norm = norm_layer((self.head_dim,)) if qk_norm else nn.Identity()

        self.attn_drop = Dropout(attn_drop)
        self.proj = nn.Dense(dim, dim)
        self.proj_drop = Dropout(proj_drop)

        # reusable primitive ops
        self.mul = ops.Mul()
        self.reshape = ops.Reshape()
        self.transpose = ops.Transpose()
        self.unstack = ops.Unstack(axis=0)
        self.attn_matmul_v = ops.BatchMatMul()
        self.q_matmul_k = ops.BatchMatMul(transpose_b=True)

    def construct(self, x):
        b, n, c = x.shape
        # project and split heads: (3, B, heads, N, head_dim)
        packed = self.reshape(self.qkv(x), (b, n, 3, self.num_heads, self.head_dim))
        packed = self.transpose(packed, (2, 0, 3, 1, 4))
        q, k, v = self.unstack(packed)
        q = self.q_norm(q)
        k = self.k_norm(k)

        # apply sqrt(scale) to q and k each, so q @ k^T carries the full scale
        root_scale = self.scale ** 0.5
        q = self.mul(q, root_scale)
        k = self.mul(k, root_scale)
        scores = self.q_matmul_k(q, k)

        # softmax computed in fp32 for numerical stability, cast back afterwards
        weights = ops.softmax(scores.astype(ms.float32), axis=-1).astype(scores.dtype)
        weights = self.attn_drop(weights)

        merged = self.transpose(self.attn_matmul_v(weights, v), (0, 2, 1, 3))
        merged = self.reshape(merged, (b, n, c))
        return self.proj_drop(self.proj(merged))

mindcv.models.vit.Block

Bases: Cell

Transformer block implementation.

PARAMETER DESCRIPTION
dim

The dimension of embedding.

TYPE: int

num_heads

The number of attention heads.

TYPE: int DEFAULT: 8

qkv_bias

Specifies whether the linear layer uses a bias vector. Default: True.

TYPE: bool DEFAULT: False

attn_drop

The drop rate of attention, greater than 0 and less equal than 1. Default: 0.0.

TYPE: float DEFAULT: 0.0

proj_drop

The drop rate of dense layer output, greater than 0 and less equal than 1. Default: 0.0.

TYPE: float DEFAULT: 0.0

mlp_ratio

The ratio used to scale the input dimensions to obtain the dimensions of the hidden layer.

TYPE: float DEFAULT: 4.0

drop_path

The drop rate for drop path. Default: 0.0.

TYPE: float DEFAULT: 0.0

act_layer

Activation function which will be stacked on top of the normalization layer (if not None), otherwise on top of the conv layer. Default: nn.GELU.

TYPE: Cell DEFAULT: GELU

norm_layer

Norm layer that will be stacked on top of the convolution layer. Default: nn.LayerNorm.

TYPE: Cell DEFAULT: LayerNorm

RETURNS DESCRIPTION

Tensor, output tensor.

Examples:

>>> ops = TransformerEncoder(768, 12, 12, 3072)
Source code in mindcv\models\vit.py
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
class Block(nn.Cell):
    """
    Transformer block implementation: pre-norm attention and MLP sub-blocks,
    each with optional layer scale and drop path, joined by residual connections.

    Args:
        dim (int): The dimension of embedding.
        num_heads (int): The number of attention heads. Default: 8.
        mlp_ratio (float): The ratio used to scale the input dimensions to obtain the
            dimensions of the hidden layer. Default: 4.0.
        qkv_bias (bool): Specifies whether the linear layer uses a bias vector. Default: False.
        qk_norm (bool): Specifies whether to normalize q and k inside attention. Default: False.
        proj_drop (float): The drop rate of dense layer output, greater than 0 and less equal than 1.
            Also used as the MLP dropout rate. Default: 0.0.
        attn_drop (float): The drop rate of attention, greater than 0 and less equal than 1. Default: 0.0.
        init_values (Optional[float]): Initial value for LayerScale; if falsy, LayerScale
            is disabled (identity). Default: None.
        drop_path (float): The drop rate for drop path. Default: 0.0.
        act_layer (nn.Cell): Activation function which will be stacked on top of the
            normalization layer (if not None), otherwise on top of the conv layer. Default: nn.GELU.
        norm_layer (nn.Cell): Norm layer that will be stacked on top of the convolution
            layer. Default: nn.LayerNorm.
        mlp_layer (Callable): Factory for the feed-forward sub-block. Default: Mlp.

    Returns:
        Tensor, output tensor.

    Examples:
        >>> ops = TransformerEncoder(768, 12, 12, 3072)
    """
    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        mlp_ratio: float = 4.,
        qkv_bias: bool = False,
        qk_norm: bool = False,
        proj_drop: float = 0.,
        attn_drop: float = 0.,
        init_values: Optional[float] = None,
        drop_path: float = 0.,
        act_layer: nn.Cell = nn.GELU,
        norm_layer: nn.Cell = nn.LayerNorm,
        mlp_layer: Callable = Mlp,
    ):
        super(Block, self).__init__()
        self.norm1 = norm_layer((dim,))
        self.attn = Attention(
            dim=dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_norm=qk_norm,
            attn_drop=attn_drop,
            proj_drop=proj_drop,
            norm_layer=norm_layer,
        )
        # LayerScale and DropPath collapse to identity when disabled
        self.ls1 = LayerScale(dim=dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity()

        self.norm2 = norm_layer((dim,))
        self.mlp = mlp_layer(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            act_layer=act_layer,
            drop=proj_drop
        )
        self.ls2 = LayerScale(dim=dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity()

    def construct(self, x):
        # pre-norm residual: x + DropPath(LayerScale(SubBlock(Norm(x))))
        x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x))))
        x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x))))
        return x

mindcv.models.vit.LayerScale

Bases: Cell

Layer scale, help ViT improve the training dynamic, allowing for the training of deeper high-capacity image transformers that benefit from depth

PARAMETER DESCRIPTION
dim

The output dimension of the attention layer or mlp layer.

TYPE: int

init_values

The scale factor. Default: 1e-5.

TYPE: float DEFAULT: 1e-05

RETURNS DESCRIPTION

Tensor, output tensor.

Examples:

>>> ops = LayerScale(768, 0.01)
Source code in mindcv\models\vit.py
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
class LayerScale(nn.Cell):
    """
    Layer scale: multiplies the input by a learnable per-channel factor.
    Helps ViT training dynamics, enabling deeper high-capacity image
    transformers that benefit from depth.

    Args:
        dim (int): The output dimension of the attention layer or mlp layer.
        init_values (float): The initial value of the scale factor. Default: 1e-5.

    Returns:
        Tensor, the scaled input tensor.

    Examples:
        >>> ops = LayerScale(768, 0.01)
    """
    def __init__(
        self,
        dim: int,
        init_values: float = 1e-5
    ):
        super(LayerScale, self).__init__()
        # learnable per-channel scale, initialized to a small constant
        self.gamma = Parameter(initializer(init_values, dim))

    def construct(self, x):
        return x * self.gamma

mindcv.models.vit.VisionTransformer

Bases: Cell

ViT encoder, which returns the feature encoded by transformer encoder.

Source code in mindcv\models\vit.py
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
class VisionTransformer(nn.Cell):
    '''
    ViT encoder, which returns the feature encoded by transformer encoder.

    Args:
        image_size (int): input image size. Default: 224.
        patch_size (int): patch size. Default: 16.
        in_channels (int): number of input channels. Default: 3.
        global_pool (str): pooling for the head; one of '' (no pooling),
            'avg' (mean over patch tokens) or 'token' (class token). Default: 'token'.
        embed_dim (int): embedding dimension. Default: 768.
        depth (int): number of transformer blocks. Default: 12.
        num_heads (int): number of attention heads. Default: 12.
        mlp_ratio (float): hidden/embed ratio inside the MLP. Default: 4.
        qkv_bias (bool): whether the qkv projection has a bias. Default: True.
        qk_norm (bool): whether to normalize q and k inside attention. Default: False.
        drop_rate (float): dropout before the classifier head. Default: 0.
        pos_drop_rate (float): dropout after adding position embeddings. Default: 0.
        patch_drop_rate (float): rate for PatchDropout; 0 disables it. Default: 0.
        proj_drop_rate (float): dropout on attention/MLP projections. Default: 0.
        attn_drop_rate (float): dropout on attention weights. Default: 0.
        drop_path_rate (float): maximum stochastic-depth rate (linearly scheduled
            over the blocks). Default: 0.
        weight_init (bool): run the custom weight initialization. Default: True.
        init_values (Optional[float]): LayerScale init value; None disables LayerScale.
        no_embed_class (bool): if True, position embeddings do not cover the class
            token (deit-3 / big-vision style). Default: False.
        pre_norm (bool): apply a norm before the blocks and drop the patch-embed
            bias (e.g. CLIP). Default: False.
        fc_norm (Optional[bool]): normalize after pooling instead of before;
            defaults to True when global_pool == 'avg'.
        dynamic_img_size (bool): allow variable input sizes; pos embed is resampled
            at run time. Default: False.
        dynamic_img_pad (bool): pad inputs so they divide by patch_size. Default: False.
        act_layer (nn.Cell): activation in the MLP. Default: nn.GELU.
        embed_layer (Callable): patch-embedding factory. Default: PatchEmbed.
        norm_layer (nn.Cell): normalization layer. Default: nn.LayerNorm.
        mlp_layer (Callable): MLP factory. Default: Mlp.
        class_token (bool): prepend a learnable class token. Default: True.
        block_fn (Callable): transformer block factory. Default: Block.
        num_classes (int): classifier output size; <= 0 gives an identity head. Default: 1000.
    '''
    def __init__(
        self,
        image_size: int = 224,
        patch_size: int = 16,
        in_channels: int = 3,
        global_pool: str = 'token',
        embed_dim: int = 768,
        depth: int = 12,
        num_heads: int = 12,
        mlp_ratio: float = 4.,
        qkv_bias: bool = True,
        qk_norm: bool = False,
        drop_rate: float = 0.,
        pos_drop_rate: float = 0.,
        patch_drop_rate: float = 0.,
        proj_drop_rate: float = 0.,
        attn_drop_rate: float = 0.,
        drop_path_rate: float = 0.,
        weight_init: bool = True,
        init_values: Optional[float] = None,
        no_embed_class: bool = False,
        pre_norm: bool = False,
        fc_norm: Optional[bool] = None,
        dynamic_img_size: bool = False,
        dynamic_img_pad: bool = False,
        act_layer: nn.Cell = nn.GELU,
        embed_layer: Callable = PatchEmbed,
        norm_layer: nn.Cell = nn.LayerNorm,
        mlp_layer: Callable = Mlp,
        class_token: bool = True,
        block_fn: Callable = Block,
        num_classes: int = 1000,
    ):
        super(VisionTransformer, self).__init__()
        assert global_pool in ('', 'avg', 'token')
        assert class_token or global_pool != 'token'
        # with average pooling the final norm moves after pooling (fc_norm)
        use_fc_norm = global_pool == 'avg' if fc_norm is None else fc_norm

        self.global_pool = global_pool
        self.num_prefix_tokens = 1 if class_token else 0
        self.no_embed_class = no_embed_class
        self.dynamic_img_size = dynamic_img_size
        self.dynamic_img_pad = dynamic_img_pad

        embed_args = {}
        if dynamic_img_size:
            # flatten deferred until after pos embed
            embed_args.update(dict(strict_img_size=False, output_fmt='NHWC'))
        elif dynamic_img_pad:
            embed_args.update(dict(output_fmt='NHWC'))

        self.patch_embed = embed_layer(
            image_size=image_size,
            patch_size=patch_size,
            in_chans=in_channels,
            embed_dim=embed_dim,
            bias=not pre_norm,  # disable bias if pre-norm is used (e.g. CLIP)
            dynamic_img_pad=dynamic_img_pad,
            **embed_args,
        )
        num_patches = self.patch_embed.num_patches

        # class token and pos embed are initialized with truncated normal (std=0.02)
        self.cls_token = Parameter(initializer(TruncatedNormal(0.02), (1, 1, embed_dim))) if class_token else None
        embed_len = num_patches if no_embed_class else num_patches + self.num_prefix_tokens
        self.pos_embed = Parameter(initializer(TruncatedNormal(0.02), (1, embed_len, embed_dim)))
        self.pos_drop = Dropout(pos_drop_rate)
        if patch_drop_rate > 0:
            self.patch_drop = PatchDropout(
                patch_drop_rate,
                num_prefix_tokens=self.num_prefix_tokens,
            )
        else:
            self.patch_drop = nn.Identity()

        self.norm_pre = norm_layer((embed_dim,)) if pre_norm else nn.Identity()
        # per-block stochastic-depth rates, linearly increasing with depth
        dpr = [x.item() for x in np.linspace(0, drop_path_rate, depth)]
        self.blocks = nn.CellList([
            block_fn(
                dim=embed_dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_norm=qk_norm,
                attn_drop=attn_drop_rate, proj_drop=proj_drop_rate,
                mlp_ratio=mlp_ratio, drop_path=dpr[i], init_values=init_values,
                act_layer=act_layer, norm_layer=norm_layer, mlp_layer=mlp_layer,
            ) for i in range(depth)
        ])

        # exactly one of norm / fc_norm is active, depending on use_fc_norm
        self.norm = norm_layer((embed_dim,)) if not use_fc_norm else nn.Identity()
        self.fc_norm = norm_layer((embed_dim,)) if use_fc_norm else nn.Identity()
        self.head_drop = Dropout(drop_rate)
        self.head = nn.Dense(embed_dim, num_classes) if num_classes > 0 else nn.Identity()

        if weight_init:
            self._init_weights()

    def get_num_layers(self):
        """Return the number of transformer blocks."""
        return len(self.blocks)

    def _init_weights(self):
        """Xavier-uniform init for patch-embed conv and dense layers; unit/zero for LayerNorm."""
        # the conv kernel is flattened so XavierUniform sees a 2D fan-in/fan-out
        w = self.patch_embed.proj.weight
        w_shape_flatted = (w.shape[0], functools.reduce(lambda x, y: x*y, w.shape[1:]))
        w_value = initializer(XavierUniform(), w_shape_flatted, w.dtype)
        w_value.init_data()
        w.set_data(w_value.reshape(w.shape))
        for _, cell in self.cells_and_names():
            if isinstance(cell, nn.Dense):
                cell.weight.set_data(
                    initializer(XavierUniform(), cell.weight.shape, cell.weight.dtype)
                )
                if cell.bias is not None:
                    cell.bias.set_data(
                        initializer('zeros', cell.bias.shape, cell.bias.dtype)
                    )
            elif isinstance(cell, nn.LayerNorm):
                cell.gamma.set_data(
                    initializer('ones', cell.gamma.shape, cell.gamma.dtype)
                )
                cell.beta.set_data(
                    initializer('zeros', cell.beta.shape, cell.beta.dtype)
                )

    def _pos_embed(self, x):
        """Add position embeddings (resampled for dynamic sizes) and prepend the class token."""
        if self.dynamic_img_size or self.dynamic_img_pad:
            # bhwc format
            B, H, W, C = x.shape
            pos_embed = resample_abs_pos_embed(
                self.pos_embed,
                (H, W),
                num_prefix_tokens=0 if self.no_embed_class else self.num_prefix_tokens,
            )
            x = ops.reshape(x, (B, -1, C))
        else:
            pos_embed = self.pos_embed

        if self.no_embed_class:
            # deit-3, updated JAX (big vision)
            # position embedding does not overlap with class token, add then concat
            x = x + pos_embed
            if self.cls_token is not None:
                cls_tokens = ops.broadcast_to(self.cls_token, (x.shape[0], -1, -1))
                cls_tokens = cls_tokens.astype(x.dtype)
                x = ops.concat((cls_tokens, x), axis=1)
        else:
            # original timm, JAX, and deit vit impl
            # pos_embed has entry for class token, concat then add
            if self.cls_token is not None:
                cls_tokens = ops.broadcast_to(self.cls_token, (x.shape[0], -1, -1))
                cls_tokens = cls_tokens.astype(x.dtype)
                x = ops.concat((cls_tokens, x), axis=1)
            x = x + pos_embed

        return self.pos_drop(x)

    def forward_features(self, x):
        """Encode the image into a token sequence (B, N, embed_dim)."""
        x = self.patch_embed(x)
        x = self._pos_embed(x)
        x = self.patch_drop(x)
        x = self.norm_pre(x)
        for blk in self.blocks:
            x = blk(x)
        x = self.norm(x)
        return x

    def forward_head(self, x):
        """Pool the token sequence and apply the classifier head."""
        if self.global_pool:
            # 'avg': mean over patch tokens (prefix tokens excluded); 'token': class token
            x = x[:, self.num_prefix_tokens:].mean(axis=1) if self.global_pool == 'avg' else x[:, 0]
        x = self.fc_norm(x)
        x = self.head_drop(x)
        x = self.head(x)
        return x

    def construct(self, x):
        x = self.forward_features(x)
        x = self.forward_head(x)
        return x

volo

mindcv.models.volo

Vision OutLOoker (VOLO) implementation Modified from timm/models/vision_transformer.py

mindcv.models.volo.Attention

Bases: Cell

Implementation of self-attention

Source code in mindcv\models\volo.py
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
class Attention(nn.Cell):
    """Multi-head self-attention over a channel-last (B, H, W, C) token grid."""

    def __init__(
        self,
        dim,
        num_heads=8,
        qkv_bias=False,
        qk_scale=None,
        attn_drop=0.0,
        proj_drop=0.0,
    ) -> None:
        super().__init__()
        self.num_heads = num_heads
        # default scale is 1/sqrt(per-head channel count)
        self.scale = qk_scale or (dim // num_heads) ** -0.5

        self.qkv = nn.Dense(dim, dim * 3, has_bias=qkv_bias)
        self.attn_drop = Dropout(p=attn_drop)
        self.proj = nn.Dense(dim, dim)
        self.proj_drop = Dropout(p=proj_drop)
        self.softmax = nn.Softmax(axis=-1)
        self.batch_mat_mul_transpose = ops.BatchMatMul(transpose_b=True)
        self.batch_mat_mul = ops.BatchMatMul()

    def construct(self, x: Tensor) -> Tensor:
        b, h, w, c = x.shape
        n = h * w

        # project to q/k/v and split heads: (3, B, heads, N, c_per_head)
        qkv = ops.reshape(self.qkv(x), (b, n, 3, self.num_heads, c // self.num_heads))
        qkv = ops.transpose(qkv, (2, 0, 3, 1, 4))
        q, k, v = qkv[0], qkv[1], qkv[2]

        # scaled dot-product attention
        scores = self.batch_mat_mul_transpose(q, k) * self.scale
        weights = self.attn_drop(self.softmax(scores))

        # merge heads back into the (B, H, W, C) layout
        out = ops.transpose(self.batch_mat_mul(weights, v), (0, 2, 1, 3))
        out = ops.reshape(out, (b, h, w, c))
        return self.proj_drop(self.proj(out))

mindcv.models.volo.ClassAttention

Bases: Cell

Class attention layer from CaiT, see details in CaiT Class attention is the post stage in our VOLO, which is optional.

Source code in mindcv\models\volo.py
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
class ClassAttention(nn.Cell):
    """
    Class attention layer from CaiT, see details in CaiT.
    Class attention is the post stage in our VOLO, which is optional:
    a single query (the class token) attends over all tokens.
    """

    def __init__(
        self,
        dim,
        num_heads=8,
        head_dim=None,
        qkv_bias=False,
        qk_scale=None,
        attn_drop=0.0,
        proj_drop=0.0,
    ) -> None:
        super().__init__()
        self.num_heads = num_heads
        # fall back to an even split of `dim` when head_dim is not given
        self.head_dim = head_dim if head_dim is not None else dim // num_heads
        self.scale = qk_scale or self.head_dim ** -0.5

        inner_dim = self.head_dim * self.num_heads
        self.kv = nn.Dense(dim, inner_dim * 2, has_bias=qkv_bias)
        self.q = nn.Dense(dim, inner_dim, has_bias=qkv_bias)
        self.attn_drop = Dropout(p=attn_drop)
        self.proj = nn.Dense(inner_dim, dim)
        self.proj_drop = Dropout(p=proj_drop)
        self.batch_mat_mul_transpose = ops.BatchMatMul(transpose_b=True)
        self.batch_mat_mul = ops.BatchMatMul()
        self.softmax = nn.Softmax(axis=-1)

    def construct(self, x: Tensor) -> Tensor:
        b, n, _ = x.shape

        # keys/values from every token: (2, B, heads, N, head_dim)
        kv = ops.reshape(self.kv(x), (b, n, 2, self.num_heads, self.head_dim))
        kv = ops.transpose(kv, (2, 0, 3, 1, 4))
        k, v = kv[0], kv[1]

        # a single query built from the class token only
        q = ops.reshape(self.q(x[:, :1, :]), (b, self.num_heads, 1, self.head_dim))

        attn = self.batch_mat_mul_transpose(q * self.scale, k)
        attn = self.attn_drop(self.softmax(attn))

        # aggregate values and merge heads back to (B, 1, heads*head_dim)
        cls_embed = ops.transpose(self.batch_mat_mul(attn, v), (0, 2, 1, 3))
        cls_embed = ops.reshape(cls_embed, (b, 1, self.head_dim * self.num_heads))
        return self.proj_drop(self.proj(cls_embed))

mindcv.models.volo.ClassBlock

Bases: Cell

Class attention block from CaiT, see details in CaiT We use two-layers class attention in our VOLO, which is optional.

Source code in mindcv\models\volo.py
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
class ClassBlock(nn.Cell):
    """
    Class attention block from CaiT, see details in CaiT.
    We use two-layers class attention in our VOLO, which is optional:
    only the class token is refined; patch tokens pass through unchanged.
    """

    def __init__(
        self,
        dim,
        num_heads,
        head_dim=None,
        mlp_ratio=4.,
        qkv_bias=False,
        qk_scale=None,
        drop=0.0,
        attn_drop=0.0,
        drop_path=0.0,
        act_layer=nn.GELU,
        norm_layer=nn.LayerNorm,
    ) -> None:
        super().__init__()
        self.norm1 = norm_layer([dim])
        self.attn = ClassAttention(
            dim,
            num_heads=num_heads,
            head_dim=head_dim,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
        )
        # stochastic depth on the residual branches
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity()
        self.norm2 = norm_layer([dim])
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            act_layer=act_layer,
            drop=drop,
        )

    def construct(self, x: Tensor) -> Tensor:
        cls_embed = x[:, :1]
        # attention residual attends over the full token sequence
        cls_embed = cls_embed + self.drop_path(self.attn(self.norm1(x)))
        # MLP residual acts on the class token alone
        cls_embed = cls_embed + self.drop_path(self.mlp(self.norm2(cls_embed)))
        return ops.concat([cls_embed, x[:, 1:]], 1)

mindcv.models.volo.Downsample

Bases: Cell

Image to Patch Embedding, downsampling between stage1 and stage2

Source code in mindcv\models\volo.py
489
490
491
492
493
494
495
496
497
498
499
500
501
502
class Downsample(nn.Cell):
    """
    Image to Patch Embedding, downsampling between stage1 and stage2:
    a strided conv reduces the spatial grid by `patch_size`.
    """

    def __init__(self, in_embed_dim, out_embed_dim, patch_size,) -> None:
        super().__init__()
        self.proj = nn.Conv2d(
            in_embed_dim, out_embed_dim,
            kernel_size=patch_size, stride=patch_size, has_bias=True,
        )

    def construct(self, x: Tensor) -> Tensor:
        # channel-last -> channel-first for the conv, then back
        out = ops.transpose(x, (0, 3, 1, 2))
        out = self.proj(out)  # B, C, H, W
        return ops.transpose(out, (0, 2, 3, 1))

mindcv.models.volo.Fold

Bases: Cell

Source code in mindcv\models\volo.py
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
class Fold(nn.Cell):
    def __init__(self, channels, output_size, kernel_size, dilation=1, padding=0, stride=1) -> None:
        """Alternative implementation of fold layer via transposed convolution.
        All parameters are same as `"torch.nn.Fold" <https://pytorch.org/docs/stable/generated/torch.nn.Fold.html>`_,
        except for the additional `channels` parameter. We need `channels` to calculate the pre-allocated memory
        size of the convolution kernel.
        :param channels: same as the `C` in the document of `"torch.nn.Fold"
                         <https://pytorch.org/docs/stable/generated/torch.nn.Fold.html>`_
        :type channels: int
        """
        super().__init__()

        def _pair(value):
            # normalize an int argument to an (h, w) tuple
            return (value, value) if isinstance(value, int) else value

        self.output_size = _pair(output_size)
        self.kernel_size = _pair(kernel_size)
        self.dilation = _pair(dilation)
        self.padding = _pair(padding)
        self.stride = _pair(stride)

        # number of sliding-window positions along each spatial axis
        # (standard convolution output-size arithmetic)
        self.h = int((self.output_size[0] + 2 * self.padding[0]
                      - self.dilation[0] * (self.kernel_size[0] - 1) - 1) / self.stride[0] + 1)
        self.w = int((self.output_size[1] + 2 * self.padding[1]
                      - self.dilation[1] * (self.kernel_size[1] - 1) - 1) / self.stride[1] + 1)
        self.k = self.kernel_size[0] * self.kernel_size[1]
        self.c = channels
        self.ck = self.c * self.k

        # one-hot depthwise kernel: input channel i scatters into kernel
        # cell (i % k), so the transposed conv reproduces torch.nn.Fold
        init_weight = np.zeros((self.ck, 1, self.kernel_size[0], self.kernel_size[1]))
        for i in range(self.ck):
            cell = i % self.k
            init_weight[i, 0, cell // self.kernel_size[1], cell % self.kernel_size[1]] = 1

        self.weight = ms.Tensor(init_weight, ms.float16)
        self.conv_transpose2d = ops.Conv2DTranspose(
                                    self.ck, self.kernel_size, pad_mode="pad",
                                    pad=(self.padding[0], self.padding[0], self.padding[1], self.padding[1]),
                                    stride=stride, dilation=dilation, group=self.c)

    def construct(self, x: Tensor) -> Tensor:
        # x: (B, C*k, L) with L == h*w sliding-window positions
        b, ck, _ = x.shape
        grid = ops.reshape(x, (b, ck, self.h, self.w))
        # the transposed conv scatters each patch back onto the output plane
        return self.conv_transpose2d(
            grid, self.weight, (b, self.c, self.output_size[0], self.output_size[1]))
mindcv.models.volo.Fold.__init__(channels, output_size, kernel_size, dilation=1, padding=0, stride=1)

Alternative implementation of fold layer via transposed convolution. All parameters are same as "torch.nn.Fold" <https://pytorch.org/docs/stable/generated/torch.nn.Fold.html>, except for the additional channels parameter. We need channels to calculate the pre-allocated memory size of the convolution kernel. :param channels: same as the C in the document of "torch.nn.Fold" <https://pytorch.org/docs/stable/generated/torch.nn.Fold.html> :type channels: int

Source code in mindcv\models\volo.py
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
def __init__(self, channels, output_size, kernel_size, dilation=1, padding=0, stride=1) -> None:
    """Alternative implementation of fold layer via transposed convolution.
    All parameters are same as `"torch.nn.Fold" <https://pytorch.org/docs/stable/generated/torch.nn.Fold.html>`_,
    except for the additional `channels` parameter. We need `channels` to calculate the pre-allocated memory
    size of the convolution kernel.
    :param channels: same as the `C` in the document of `"torch.nn.Fold"
                     <https://pytorch.org/docs/stable/generated/torch.nn.Fold.html>`_
    :type channels: int
    """
    super().__init__()

    # Normalize every int argument to an (h, w) pair.
    def int2tuple(a):
        if isinstance(a, int):
            return (a, a)
        return a
    self.output_size, self.kernel_size, self.dilation, self.padding, self.stride = map(
                                int2tuple, (output_size, kernel_size, dilation, padding, stride))
    # Number of sliding-window positions along each axis
    # (standard convolution output-size arithmetic).
    self.h = int((self.output_size[0] + 2 * self.padding[0] - self.dilation[0] * (self.kernel_size[0] - 1) - 1)
                 / self.stride[0] + 1)
    self.w = int((self.output_size[1] + 2 * self.padding[1] - self.dilation[1] * (self.kernel_size[1] - 1) - 1)
                 / self.stride[1] + 1)
    self.k = self.kernel_size[0] * self.kernel_size[1]
    self.c = channels
    self.ck = self.c * self.k
    # One-hot depthwise kernel: input channel i writes into kernel cell (i % k),
    # so the transposed conv reproduces torch.nn.Fold's scatter behavior.
    init_weight = np.zeros((self.ck, 1, self.kernel_size[0], self.kernel_size[1]))
    for i in range(self.ck):
        xy = i % self.k
        x = xy // self.kernel_size[1]
        y = xy % self.kernel_size[1]
        init_weight[i, 0, x, y] = 1

    # NOTE(review): the weight dtype is hard-coded to float16 — presumably for
    # device performance; confirm this is safe on float32-only backends.
    self.weight = ms.Tensor(init_weight, ms.float16)
    self.conv_transpose2d = ops.Conv2DTranspose(
                                self.ck, self.kernel_size, pad_mode="pad",
                                pad=(self.padding[0], self.padding[0], self.padding[1], self.padding[1]),
                                stride=stride, dilation=dilation, group=self.c)

mindcv.models.volo.Mlp

Bases: Cell

Implementation of MLP

Source code in mindcv\models\volo.py
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
class Mlp(nn.Cell):
    """Two-layer feed-forward network: Dense -> activation -> drop -> Dense -> drop."""

    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        act_layer=nn.GELU,
        drop=0.0,
    ) -> None:
        super().__init__()
        # both widths default to the input width when unspecified
        hidden_features = hidden_features or in_features
        out_features = out_features or in_features
        self.fc1 = nn.Dense(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Dense(hidden_features, out_features)
        self.drop = Dropout(p=drop)

    def construct(self, x: Tensor) -> Tensor:
        out = self.drop(self.act(self.fc1(x)))
        out = self.drop(self.fc2(out))
        return out

mindcv.models.volo.OutlookAttention

Bases: Cell

Implementation of outlook attention --dim: hidden dim --num_heads: number of heads --kernel_size: kernel size in each window for outlook attention return: token features after outlook attention

Source code in mindcv\models\volo.py
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
class OutlookAttention(nn.Cell):
    """
    Implementation of outlook attention
    --dim: hidden dim
    --num_heads: number of heads
    --kernel_size: kernel size in each window for outlook attention
    return: token features after outlook attention
    """

    def __init__(
        self,
        dim,
        num_heads,
        kernel_size=3,
        padding=1,
        stride=1,
        qkv_bias=False,
        qk_scale=None,
        attn_drop=0.0,
        proj_drop=0.0,
    ) -> None:
        super().__init__()
        head_dim = dim // num_heads
        self.num_heads = num_heads
        self.kernel_size = kernel_size
        self.padding = padding
        self.stride = stride
        # default attention scale is 1/sqrt(per-head channel count)
        self.scale = qk_scale or head_dim**-0.5

        # values come from a linear projection; attention weights are predicted
        # directly from the (pooled) input — one k*k x k*k matrix per head/window
        self.v = nn.Dense(dim, dim, has_bias=qkv_bias)
        self.attn = nn.Dense(dim, kernel_size**4 * num_heads)

        self.attn_drop = Dropout(p=attn_drop)
        self.proj = nn.Dense(dim, dim)
        self.proj_drop = Dropout(p=proj_drop)

        self.unfold = nn.Unfold(ksizes=[1, kernel_size, kernel_size, 1], strides=[1, stride, stride, 1],
                                rates=[1, 1, 1, 1])
        self.pool = nn.AvgPool2d(kernel_size=stride, stride=stride)
        self.softmax = nn.Softmax(axis=-1)
        self.batch_mat_mul = ops.BatchMatMul()

    def construct(self, x: Tensor) -> Tensor:
        # x: channel-last (B, H, W, C) token grid
        B, H, W, C = x.shape

        v = ops.transpose(self.v(x), (0, 3, 1, 2))  # B, C, H, W

        # number of window centers along each spatial axis after striding
        h = int((H - 1) / self.stride + 1)
        w = int((W - 1) / self.stride + 1)
        # NOTE(review): spatial pad is hard-coded to 1 per side rather than using
        # self.padding — consistent with the default padding=1; confirm before
        # instantiating this cell with a different padding.
        v = ops.pad(v, ((0, 0), (0, 0), (1, 1), (1, 1)))
        v = self.unfold(v)
        # regroup unfolded patches by head: (B, heads, C/heads, k*k, h*w)
        v = ops.reshape(v, (B, self.num_heads, C // self.num_heads, self.kernel_size * self.kernel_size, h * w))
        v = ops.transpose(v, (0, 1, 4, 3, 2))  # B,H,N,kxk,C/H

        # predict per-window attention matrices from the pooled feature map
        attn = self.pool(ops.transpose(x, (0, 3, 1, 2)))
        attn = ops.transpose(attn, (0, 2, 3, 1))
        attn = ops.reshape(self.attn(attn), (B, h * w, self.num_heads, self.kernel_size * self.kernel_size,
                           self.kernel_size * self.kernel_size))
        attn = ops.transpose(attn, (0, 2, 1, 3, 4))  # B,H,N,kxk,kxk
        attn = attn * self.scale
        attn = self.softmax(attn)
        attn = self.attn_drop(attn)

        # weighted aggregation inside each window, then fold windows back to the grid
        x = ops.transpose(self.batch_mat_mul(attn, v), (0, 1, 4, 3, 2))
        x = ops.reshape(x, (B, C * self.kernel_size * self.kernel_size, h * w))
        # NOTE(review): a Fold cell is built on every forward call because the
        # output size (H, W) is only known at run time
        fold = Fold(C, (H, W), self.kernel_size, padding=self.padding, stride=self.stride)
        x = fold(x)
        x = self.proj(ops.transpose(x, (0, 2, 3, 1)))
        x = self.proj_drop(x)

        return x

mindcv.models.volo.Outlooker

Bases: Cell

Implementation of outlooker layer: which includes outlook attention + MLP Outlooker is the first stage in our VOLO --dim: hidden dim --num_heads: number of heads --mlp_ratio: mlp ratio --kernel_size: kernel size in each window for outlook attention return: outlooker layer

Source code in mindcv\models\volo.py
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
class Outlooker(nn.Cell):
    """
    Implementation of outlooker layer: which includes outlook attention + MLP.
    Outlooker is the first stage in our VOLO.
    --dim: hidden dim
    --num_heads: number of heads
    --mlp_ratio: mlp ratio
    --kernel_size: kernel size in each window for outlook attention
    return: outlooker layer
    """

    def __init__(
        self,
        dim,
        kernel_size,
        padding,
        stride=1,
        num_heads=1,
        mlp_ratio=3.,
        attn_drop=0.0,
        drop_path=0.0,
        act_layer=nn.GELU,
        norm_layer=nn.LayerNorm,
        qkv_bias=False,
        qk_scale=None,
    ) -> None:
        super().__init__()
        self.norm1 = norm_layer([dim])
        self.attn = OutlookAttention(
            dim,
            num_heads,
            kernel_size=kernel_size,
            padding=padding,
            stride=stride,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
        )
        # stochastic depth applied to both residual branches
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity()
        self.norm2 = norm_layer([dim])
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            act_layer=act_layer,
        )

    def construct(self, x: Tensor) -> Tensor:
        out = x + self.drop_path(self.attn(self.norm1(x)))
        out = out + self.drop_path(self.mlp(self.norm2(out)))
        return out

mindcv.models.volo.PatchEmbed

Bases: Cell

Image to Patch Embedding. Different with ViT use 1 conv layer, we use 4 conv layers to do patch embedding

Source code in mindcv\models\volo.py
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
class PatchEmbed(nn.Cell):
    """
    Image to Patch Embedding.
    Different with ViT use 1 conv layer, we use 4 conv layers to do patch embedding:
    an optional 3-conv stem followed by a strided patch projection.
    """

    def __init__(
        self,
        img_size=224,
        stem_conv=False,
        stem_stride=1,
        patch_size=8,
        in_channels=3,
        hidden_dim=64,
        embed_dim=384,
    ) -> None:
        super().__init__()
        assert patch_size in [4, 8, 16]

        self.stem_conv = stem_conv
        if stem_conv:
            # 7x7 stem conv (possibly strided) followed by two 3x3 refinement convs
            stem = [
                nn.Conv2d(in_channels, hidden_dim, 7, stem_stride,
                          pad_mode='pad', padding=3),  # 112x112
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU(),
            ]
            for _ in range(2):
                stem.extend([
                    nn.Conv2d(hidden_dim, hidden_dim, 3, 1,
                              pad_mode='pad', padding=1),  # 112x112
                    nn.BatchNorm2d(hidden_dim),
                    nn.ReLU(),
                ])
            self.conv = nn.SequentialCell(*stem)

        # NOTE(review): the projection always expects `hidden_dim` input channels,
        # even when stem_conv is False (matches upstream VOLO) — confirm callers
        # enable the stem whenever in_channels != hidden_dim.
        self.proj = nn.Conv2d(
            hidden_dim,
            embed_dim,
            kernel_size=patch_size // stem_stride,
            stride=patch_size // stem_stride,
            has_bias=True,
        )
        self.num_patches = (img_size // patch_size) ** 2

    def construct(self, x: Tensor) -> Tensor:
        if self.stem_conv:
            x = self.conv(x)
        return self.proj(x)  # B, C, H, W

mindcv.models.volo.Transformer

Bases: Cell

Implementation of Transformer, Transformer is the second stage in our VOLO

Source code in mindcv\models\volo.py
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
class Transformer(nn.Cell):
    """
    Implementation of Transformer.
    Transformer is the second stage in our VOLO: pre-norm self-attention
    plus an MLP, each with a residual connection.
    """

    def __init__(
        self,
        dim,
        num_heads,
        mlp_ratio=4.,
        qkv_bias=False,
        qk_scale=None,
        attn_drop=0.0,
        drop_path=0.0,
        act_layer=nn.GELU,
        norm_layer=nn.LayerNorm,
    ) -> None:
        super().__init__()
        self.norm1 = norm_layer([dim])
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
        )
        # stochastic depth for the residual branches
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity()
        self.norm2 = norm_layer([dim])
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            act_layer=act_layer,
        )

    def construct(self, x: Tensor) -> Tensor:
        out = x + self.drop_path(self.attn(self.norm1(x)))
        out = out + self.drop_path(self.mlp(self.norm2(out)))
        return out

mindcv.models.volo.VOLO

Bases: Cell

Vision Outlooker, the main class of our model --layers: [x,x,x,x], four blocks in two stages, the first block is outlooker, the other three are transformer, we set four blocks, which are easily applied to downstream tasks --img_size, --in_channels, --num_classes: these three are very easy to understand --patch_size: patch_size in outlook attention --stem_hidden_dim: hidden dim of patch embedding, d1-d4 is 64, d5 is 128 --embed_dims, --num_heads: embedding dim, number of heads in each block --downsamples: flags to apply downsampling or not --outlook_attention: flags to apply outlook attention or not --mlp_ratios, --qkv_bias, --qk_scale, --drop_rate: easy to understand --attn_drop_rate, --drop_path_rate, --norm_layer: easy to understand --post_layers: post layers like two class attention layers using [ca, ca], if yes, return_mean=False --return_mean: use mean of all feature tokens for classification, if yes, no class token --return_dense: use token labeling, details are here: https://github.com/zihangJiang/TokenLabeling --mix_token: mixing tokens as token labeling, details are here: https://github.com/zihangJiang/TokenLabeling --pooling_scale: pooling_scale=2 means we downsample 2x --out_kernel, --out_stride, --out_padding: kernel size, stride, and padding for outlook attention

Source code in mindcv\models\volo.py
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
class VOLO(nn.Cell):
    """
    Vision Outlooker, the main class of our model
    --layers: [x,x,x,x], four blocks in two stages, the first block is outlooker, the
              other three are transformer, we set four blocks, which are easily
              applied to downstream tasks
    --img_size, --in_channels, --num_classes: these three are very easy to understand
    --patch_size: patch_size in outlook attention
    --stem_hidden_dim: hidden dim of patch embedding, d1-d4 is 64, d5 is 128
    --embed_dims, --num_heads: embedding dim, number of heads in each block
    --downsamples: flags to apply downsampling or not
    --outlook_attention: flags to apply outlook attention or not
    --mlp_ratios, --qkv_bias, --qk_scale, --drop_rate: easy to understand
    --attn_drop_rate, --drop_path_rate, --norm_layer: easy to understand
    --post_layers: post layers like two class attention layers using [ca, ca],
                  if yes, return_mean=False
    --return_mean: use mean of all feature tokens for classification, if yes, no class token
    --return_dense: use token labeling, details are here:
                    https://github.com/zihangJiang/TokenLabeling
    --mix_token: mixing tokens as token labeling, details are here:
                    https://github.com/zihangJiang/TokenLabeling
    --pooling_scale: pooling_scale=2 means we downsample 2x
    --out_kernel, --out_stride, --out_padding: kernel size,
                                               stride, and padding for outlook attention
    """
    def __init__(
        self,
        layers,
        img_size=224,
        in_channels=3,
        num_classes=1000,
        patch_size=8,
        stem_hidden_dim=64,
        embed_dims=None,
        num_heads=None,
        downsamples=None,
        outlook_attention=None,
        mlp_ratios=None,
        qkv_bias=False,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        norm_layer=nn.LayerNorm,
        post_layers=None,
        return_mean=False,
        return_dense=True,
        mix_token=True,
        pooling_scale=2,
        out_kernel=3,
        out_stride=2,
        out_padding=1,
    ) -> None:

        super().__init__()
        self.num_classes = num_classes
        # convolutional stem (stride 2) followed by the patch projection
        self.patch_embed = PatchEmbed(stem_conv=True, stem_stride=2, patch_size=patch_size,
                                      in_channels=in_channels, hidden_dim=stem_hidden_dim,
                                      embed_dim=embed_dims[0])
        # initial positional encoding, we add positional encoding after outlooker blocks;
        # its grid matches the stage-2 resolution (pooled by pooling_scale)
        self.pos_embed = Parameter(
            ops.zeros((1, img_size // patch_size // pooling_scale,
                      img_size // patch_size // pooling_scale,
                      embed_dims[-1]), mstype.float32))

        self.pos_drop = Dropout(p=drop_rate)

        # set the main block in network
        network = []
        for i in range(len(layers)):
            if outlook_attention[i]:
                # stage 1: outlook-attention blocks
                stage = outlooker_blocks(Outlooker, i, embed_dims[i], layers,
                                         downsample=downsamples[i], num_heads=num_heads[i],
                                         kernel_size=out_kernel, stride=out_stride,
                                         padding=out_padding, mlp_ratio=mlp_ratios[i],
                                         qkv_bias=qkv_bias, qk_scale=qk_scale,
                                         attn_drop=attn_drop_rate, norm_layer=norm_layer)
                network.append(stage)
            else:
                # stage 2: standard transformer blocks
                stage = transformer_blocks(Transformer, i, embed_dims[i], layers,
                                           num_heads[i], mlp_ratio=mlp_ratios[i],
                                           qkv_bias=qkv_bias, qk_scale=qk_scale,
                                           drop_path_rate=drop_path_rate,
                                           attn_drop=attn_drop_rate,
                                           norm_layer=norm_layer)
                network.append(stage)

            if downsamples[i]:
                # downsampling between two stages (2x strided conv)
                network.append(Downsample(embed_dims[i], embed_dims[i + 1], 2))

        self.network = nn.CellList(network)

        # set post block, for example, class attention layers
        self.post_network = None
        if post_layers is not None:
            self.post_network = nn.CellList([
                get_block(post_layers[i],
                          dim=embed_dims[-1],
                          num_heads=num_heads[-1],
                          mlp_ratio=mlp_ratios[-1],
                          qkv_bias=qkv_bias,
                          qk_scale=qk_scale,
                          attn_drop=attn_drop_rate,
                          drop_path=0.0,
                          norm_layer=norm_layer)
                for i in range(len(post_layers))
            ])
            # learnable class token, truncated-normal initialized (sigma=0.02)
            self.cls_token = Parameter(ops.zeros((1, 1, embed_dims[-1]), mstype.float32))
            self.cls_token.set_data(init.initializer(init.TruncatedNormal(sigma=.02), self.cls_token.data.shape))

        # set output type
        self.return_mean = return_mean  # if yes, return mean, not use class token
        self.return_dense = return_dense  # if yes, return class token and all feature tokens
        if return_dense:
            assert not return_mean, "cannot return both mean and dense"
        self.mix_token = mix_token
        self.pooling_scale = pooling_scale
        if mix_token:  # enable token mixing, see token labeling for details.
            self.beta = 1.0
            assert return_dense, "return all tokens if mix_token is enabled"
        if return_dense:
            # auxiliary head for token labeling over all patch tokens
            self.aux_head = nn.Dense(
                embed_dims[-1],
                num_classes) if num_classes > 0 else Identity()
        self.norm = norm_layer([embed_dims[-1]])

        # Classifier head
        self.head = nn.Dense(
            embed_dims[-1], num_classes) if num_classes > 0 else Identity()

        self.pos_embed.set_data(init.initializer(init.TruncatedNormal(sigma=.02), self.pos_embed.data.shape))
        self._init_weights()

    def _init_weights(self) -> None:
        # truncated-normal Dense weights, zero biases; unit gamma / zero beta for LayerNorm
        for name, m in self.cells_and_names():
            if isinstance(m, nn.Dense):
                m.weight.set_data(init.initializer(init.TruncatedNormal(sigma=.02), m.weight.data.shape))
                if m.bias is not None:
                    m.bias.set_data(init.initializer(init.Constant(0), m.bias.shape))
            elif isinstance(m, nn.LayerNorm):
                m.gamma.set_data(init.initializer(init.Constant(1), m.gamma.shape))
                m.beta.set_data(init.initializer(init.Constant(0), m.beta.shape))

    def forward_embeddings(self, x: Tensor) -> Tensor:
        # patch embedding
        x = self.patch_embed(x)
        # B,C,H,W-> B,H,W,C
        x = ops.transpose(x, (0, 2, 3, 1))
        return x

    def forward_tokens(self, x: Tensor) -> Tensor:
        for idx, block in enumerate(self.network):
            if idx == 2:  # add positional encoding after outlooker blocks
                x = x + self.pos_embed
                x = self.pos_drop(x)
            x = block(x)

        # flatten the spatial grid into a token sequence
        B, H, W, C = x.shape
        x = ops.reshape(x, (B, -1, C))
        return x

    def forward_cls(self, x: Tensor) -> Tensor:
        # B, N, C = x.shape
        # prepend the class token, then run the class-attention post network
        cls_tokens = ops.broadcast_to(self.cls_token, (x.shape[0], -1, -1))
        x = ops.Cast()(x, cls_tokens.dtype)
        x = ops.concat([cls_tokens, x], 1)
        for block in self.post_network:
            x = block(x)
        return x

    def construct(self, x: Tensor) -> Tensor:
        # step1: patch embedding
        x = self.forward_embeddings(x)

        # step2: tokens learning in the two stages
        x = self.forward_tokens(x)

        # step3: post network, apply class attention or not
        if self.post_network is not None:
            x = self.forward_cls(x)
        x = self.norm(x)

        if self.return_mean:  # if no class token, return mean
            return self.head(ops.mean(x, 1))

        x_cls = self.head(x[:, 0])
        if not self.return_dense:
            return x_cls

        # NOTE(review): the token-labeling dense branch (aux_head over patch
        # tokens) is not implemented here, so the dense path also returns only
        # the class logits — both branches above are currently equivalent
        return x_cls

mindcv.models.volo.get_block(block_type, **kargs)

get block by name, specifically for class attention block in here

Source code in mindcv\models\volo.py
432
433
434
435
436
437
def get_block(block_type, **kargs) -> "ClassBlock":
    """Get a post-network block by name (currently only class attention).

    Args:
        block_type: block identifier; only 'ca' (class attention) is supported.
        **kargs: keyword arguments forwarded to the block constructor.

    Raises:
        ValueError: if ``block_type`` is not recognized. (Previously an
            unknown name silently returned None, deferring the failure to
            the caller.)
    """
    if block_type == 'ca':
        return ClassBlock(**kargs)
    raise ValueError(f"Unsupported block_type: {block_type!r}; expected 'ca'.")

mindcv.models.volo.outlooker_blocks(block_fn, index, dim, layers, num_heads=1, kernel_size=3, padding=1, stride=1, mlp_ratio=3.0, qkv_bias=False, qk_scale=None, attn_drop=0.0, drop_path_rate=0.0, **kwargs)

generate outlooker layer in stage1 return: outlooker layers

Source code in mindcv\models\volo.py
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
def outlooker_blocks(block_fn, index, dim, layers, num_heads=1, kernel_size=3,
                     padding=1, stride=1, mlp_ratio=3., qkv_bias=False, qk_scale=None,
                     attn_drop=0.0, drop_path_rate=0.0, **kwargs) -> nn.SequentialCell:
    """Build the stage-1 outlooker layer.

    Each block's drop-path rate grows linearly with its global depth position,
    reaching ``drop_path_rate`` at the last block of the whole network.
    """
    depth_offset = sum(layers[:index])
    total_depth = sum(layers) - 1
    stage = []
    for i in range(layers[index]):
        dpr = drop_path_rate * (i + depth_offset) / total_depth
        stage.append(block_fn(dim, kernel_size=kernel_size, padding=padding,
                              stride=stride, num_heads=num_heads, mlp_ratio=mlp_ratio,
                              qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop,
                              drop_path=dpr))

    return nn.SequentialCell(*stage)

mindcv.models.volo.transformer_blocks(block_fn, index, dim, layers, num_heads, mlp_ratio=3.0, qkv_bias=False, qk_scale=None, attn_drop=0, drop_path_rate=0.0, **kwargs)

generate transformer layers in stage2 return: transformer layers

Source code in mindcv\models\volo.py
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
def transformer_blocks(block_fn, index, dim, layers, num_heads, mlp_ratio=3.,
                       qkv_bias=False, qk_scale=None, attn_drop=0,
                       drop_path_rate=0.0, **kwargs) -> nn.SequentialCell:
    """Build a stage-2 transformer layer.

    Drop-path rates scale linearly with each block's global depth position,
    consistent with ``outlooker_blocks``.
    """
    depth_offset = sum(layers[:index])
    total_depth = sum(layers) - 1
    stage = []
    for i in range(layers[index]):
        stage.append(
            block_fn(dim, num_heads,
                     mlp_ratio=mlp_ratio,
                     qkv_bias=qkv_bias,
                     qk_scale=qk_scale,
                     attn_drop=attn_drop,
                     drop_path=drop_path_rate * (i + depth_offset) / total_depth))

    return nn.SequentialCell(*stage)

mindcv.models.volo.volo_d1(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

VOLO-D1 model, Params: 27M --layers: [x,x,x,x], four blocks in two stages, the first stage(block) is outlooker, the other three blocks are transformer, we set four blocks, which are easily applied to downstream tasks --embed_dims, --num_heads,: embedding dim, number of heads in each block --downsamples: flags to apply downsampling or not in four blocks --outlook_attention: flags to apply outlook attention or not --mlp_ratios: mlp ratio in four blocks --post_layers: post layers like two class attention layers using [ca, ca] See detail for all args in the class VOLO()

Source code in mindcv\models\volo.py
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
@register_model
def volo_d1(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs):
    """Build VOLO-D1 (~27M params).

    Four blocks over two stages: the first block is an outlooker stage, the
    remaining three are transformer stages. ``embed_dims`` / ``num_heads``
    give the per-block width; ``downsamples`` and ``outlook_attention`` are
    per-block flags; ``post_layers`` adds two class-attention ('ca') layers.
    See the VOLO class for the full argument description.
    """
    cfg = default_cfgs['volo_d1']

    # first block is outlooker (stage1), the other three are transformer (stage2)
    model = VOLO(
        layers=[4, 4, 8, 2],
        in_channels=in_channels,
        num_classes=num_classes,
        embed_dims=[192, 384, 384, 384],
        num_heads=[6, 12, 12, 12],
        mlp_ratios=[3, 3, 3, 3],
        downsamples=[True, False, False, False],
        outlook_attention=[True, False, False, False],
        post_layers=['ca', 'ca'],
        **kwargs,
    )
    model.default_cfg = cfg

    if pretrained:
        load_pretrained(model, cfg, num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.volo.volo_d2(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

VOLO-D2 model, Params: 59M

Source code in mindcv\models\volo.py
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
@register_model
def volo_d2(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs):
    """Build VOLO-D2 (~59M params); see the VOLO class for argument details."""
    cfg = default_cfgs['volo_d2']
    model = VOLO(
        layers=[6, 4, 10, 4],
        in_channels=in_channels,
        num_classes=num_classes,
        embed_dims=[256, 512, 512, 512],
        num_heads=[8, 16, 16, 16],
        mlp_ratios=[3, 3, 3, 3],
        downsamples=[True, False, False, False],
        outlook_attention=[True, False, False, False],
        post_layers=['ca', 'ca'],
        **kwargs,
    )
    model.default_cfg = cfg

    if pretrained:
        load_pretrained(model, cfg, num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.volo.volo_d3(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

VOLO-D3 model, Params: 86M

Source code in mindcv\models\volo.py
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
@register_model
def volo_d3(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs):
    """Build VOLO-D3 (~86M params); see the VOLO class for argument details."""
    cfg = default_cfgs['volo_d3']
    model = VOLO(
        layers=[8, 8, 16, 4],
        in_channels=in_channels,
        num_classes=num_classes,
        embed_dims=[256, 512, 512, 512],
        num_heads=[8, 16, 16, 16],
        mlp_ratios=[3, 3, 3, 3],
        downsamples=[True, False, False, False],
        outlook_attention=[True, False, False, False],
        post_layers=['ca', 'ca'],
        **kwargs,
    )
    model.default_cfg = cfg

    if pretrained:
        load_pretrained(model, cfg, num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.volo.volo_d4(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

VOLO-D4 model, Params: 193M

Source code in mindcv\models\volo.py
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
@register_model
def volo_d4(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs):
    """Build VOLO-D4 (~193M params); see the VOLO class for argument details."""
    cfg = default_cfgs['volo_d4']
    model = VOLO(
        layers=[8, 8, 16, 4],
        in_channels=in_channels,
        num_classes=num_classes,
        embed_dims=[384, 768, 768, 768],
        num_heads=[12, 16, 16, 16],
        mlp_ratios=[3, 3, 3, 3],
        downsamples=[True, False, False, False],
        outlook_attention=[True, False, False, False],
        post_layers=['ca', 'ca'],
        **kwargs,
    )
    model.default_cfg = cfg

    if pretrained:
        load_pretrained(model, cfg, num_classes=num_classes, in_channels=in_channels)
    return model

mindcv.models.volo.volo_d5(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

VOLO-D5 model, Params: 296M stem_hidden_dim=128, the dim in patch embedding is 128 for VOLO-D5

Source code in mindcv\models\volo.py
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
@register_model
def volo_d5(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs):
    """
    VOLO-D5 model, Params: 296M
    stem_hidden_dim=128, the dim in patch embedding is 128 for VOLO-D5

    Fix: forward ``num_classes`` and ``in_channels`` to the VOLO constructor.
    Previously (unlike volo_d1..volo_d4) they were silently ignored, so the
    model was always built with the class's defaults regardless of the
    arguments passed here.
    """
    default_cfg = default_cfgs['volo_d5']
    model = VOLO(layers=[12, 12, 20, 4],
                 in_channels=in_channels,
                 num_classes=num_classes,
                 embed_dims=[384, 768, 768, 768],
                 num_heads=[12, 16, 16, 16],
                 mlp_ratios=[4, 4, 4, 4],
                 downsamples=[True, False, False, False],
                 outlook_attention=[True, False, False, False],
                 post_layers=['ca', 'ca'],
                 stem_hidden_dim=128,
                 **kwargs)
    model.default_cfg = default_cfg

    if pretrained:
        load_pretrained(model, default_cfg, num_classes=num_classes, in_channels=in_channels)
    return model

xcit

mindcv.models.xcit

MindSpore implementation of XCiT Refer to: XCiT: Cross-Covariance Image Transformers

mindcv.models.xcit.ClassAttention

Bases: Cell

Class Attention Layer as in CaiT https://arxiv.org/abs/2103.17239

Source code in mindcv\models\xcit.py
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
class ClassAttention(nn.Cell):
    """Class Attention Layer as in CaiT https://arxiv.org/abs/2103.17239

    Only the class token (position 0) attends to all tokens; the remaining
    tokens pass through unchanged (see ``construct``).
    """

    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        # default attention scale is 1/sqrt(head_dim) unless overridden
        self.scale = qk_scale or head_dim ** -0.5

        self.qkv = nn.Dense(
            in_channels=dim, out_channels=dim * 3, has_bias=qkv_bias)
        self.attn_drop = Dropout(p=attn_drop)
        self.proj = nn.Dense(in_channels=dim, out_channels=dim)
        self.proj_drop = Dropout(p=proj_drop)
        self.softmax = nn.Softmax(axis=-1)

        self.attn_matmul_v = ops.BatchMatMul()

    def construct(self, x: Tensor) -> Tensor:
        # x: (B, N, C); token 0 is assumed to be the class token
        B, N, C = x.shape

        # project to q,k,v: (B, N, 3C) -> (3, B, heads, N, head_dim)
        qkv = self.qkv(x)
        qkv = ops.reshape(qkv, (B, N, 3, self.num_heads, C // self.num_heads))
        qkv = ops.transpose(qkv, (2, 0, 3, 1, 4))
        q, k, v = ops.unstack(qkv, axis=0)
        # only the class-token query attends: qc is (B, heads, 1, head_dim)
        qc = q[:, :, 0:1]
        # scaled dot-product of the class query against every key
        attn_cls = (qc * k).sum(-1) * self.scale
        attn_cls = self.softmax(attn_cls)
        attn_cls = self.attn_drop(attn_cls)

        # weighted sum over values -> updated class token of shape (B, 1, C)
        attn_cls = ops.expand_dims(attn_cls, 2)
        cls_tkn = self.attn_matmul_v(attn_cls, v)
        cls_tkn = ops.transpose(cls_tkn, (0, 2, 1, 3))
        cls_tkn = ops.reshape(cls_tkn, (B, 1, C))
        cls_tkn = self.proj(cls_tkn)
        # replace the class token; patch tokens x[:, 1:] are untouched
        x = ops.concat((self.proj_drop(cls_tkn), x[:, 1:]), axis=1)
        return x

mindcv.models.xcit.ClassAttentionBlock

Bases: Cell

Class Attention Layer as in CaiT https://arxiv.org/abs/2103.17239

Source code in mindcv\models\xcit.py
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
class ClassAttentionBlock(nn.Cell):
    """Class Attention Layer as in CaiT https://arxiv.org/abs/2103.17239

    Pre-norm block wrapping ClassAttention and an MLP, with optional
    LayerScale (``eta``) and DropPath.
    """

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0.,
                 attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, eta=None,
                 tokens_norm=False):
        super().__init__()
        self.norm1 = norm_layer([dim])
        self.attn = ClassAttention(
            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop,
            proj_drop=drop
        )

        self.drop_path = DropPath(
            drop_path) if drop_path > 0. else ops.Identity()
        self.norm2 = norm_layer([dim])
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer,
                       drop=drop)

        # LayerScale Initialization (no layerscale when None)
        if eta is not None:
            self.gamma1 = Parameter(
                eta * ops.Ones()((dim), mstype.float32), requires_grad=True)
            self.gamma2 = Parameter(
                eta * ops.Ones()((dim), mstype.float32), requires_grad=True)
        else:
            # plain scalars: multiplication becomes a no-op scale of 1.0
            self.gamma1, self.gamma2 = 1.0, 1.0

        # FIXME: A hack for models pre-trained with layernorm over all the tokens not just the CLS
        self.tokens_norm = tokens_norm

    def construct(self, x, H, W, mask=None):
        # H, W and mask are accepted for a uniform block interface but unused here

        # attention residual branch (updates only the class token internally)
        x = x + self.drop_path(self.gamma1 * self.attn(self.norm1(x)))

        if self.tokens_norm:
            x = self.norm2(x)
        else:
            # in-place slice assignment: normalize only the class token
            x[:, 0:1] = self.norm2(x[:, 0:1])
        x_res = x
        # MLP residual branch applied to the class token only
        cls_token = x[:, 0:1]
        cls_token = self.gamma2 * self.mlp(cls_token)
        x = ops.concat((cls_token, x[:, 1:]), axis=1)
        x = x_res + self.drop_path(x)
        return x

mindcv.models.xcit.ConvPatchEmbed

Bases: Cell

Image to Patch Embedding using multiple convolutional layers

Source code in mindcv\models\xcit.py
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
class ConvPatchEmbed(nn.Cell):
    """Image to Patch Embedding using multiple convolutional layers.

    Args:
        img_size: input image size (int or 2-tuple). Default: 224.
        patch_size: patch size; must be 8 or 16. Default: 16.
        in_chans: number of input image channels. Default: 3.
        embed_dim: output embedding dimension. Default: 768.

    Fix: the first convolution previously hardcoded 3 input channels,
    silently ignoring ``in_chans``; it now uses ``in_chans``.
    """

    def __init__(self,
                 img_size: int = 224,
                 patch_size: int = 16,
                 in_chans: int = 3,
                 embed_dim: int = 768
                 ) -> None:
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        num_patches = (img_size[1] // patch_size[1]) * \
            (img_size[0] // patch_size[0])
        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = num_patches

        # stack of stride-2 conv3x3 stages; total downsampling equals patch_size
        if patch_size[0] == 16:
            self.proj = nn.SequentialCell([
                conv3x3(in_chans, embed_dim // 8, 2),
                nn.GELU(),
                conv3x3(embed_dim // 8, embed_dim // 4, 2),
                nn.GELU(),
                conv3x3(embed_dim // 4, embed_dim // 2, 2),
                nn.GELU(),
                conv3x3(embed_dim // 2, embed_dim, 2),
            ])
        elif patch_size[0] == 8:
            self.proj = nn.SequentialCell([
                conv3x3(in_chans, embed_dim // 4, 2),
                nn.GELU(),
                conv3x3(embed_dim // 4, embed_dim // 2, 2),
                nn.GELU(),
                conv3x3(embed_dim // 2, embed_dim, 2),
            ])
        else:
            raise ValueError(
                "For convolutional projection, patch size has to be in [8, 16]")

    def construct(self, x, padding_size=None) -> Tensor:
        """Project image (B, C, H, W) to tokens (B, Hp*Wp, embed_dim); also
        returns the patch-grid size (Hp, Wp). ``padding_size`` is unused."""
        x = self.proj(x)
        B, C, Hp, Wp = x.shape
        x = ops.reshape(x, (B, C, Hp * Wp))
        x = x.transpose(0, 2, 1)

        return x, (Hp, Wp)

mindcv.models.xcit.LPI

Bases: Cell

Local Patch Interaction module that allows explicit communication between tokens in 3x3 windows to augment the implicit communication performed by the block diagonal scatter attention. Implemented using 2 layers of separable 3x3 convolutions with GeLU and BatchNorm2d

Source code in mindcv\models\xcit.py
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
class LPI(nn.Cell):
    """
    Local Patch Interaction module that allows explicit communication between tokens in 3x3 windows
    to augment the implicit communication performed by the block diagonal scatter attention.
    Implemented using 2 layers of separable 3x3 convolutions with GeLU and BatchNorm2d
    """

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU,
                 drop=0., kernel_size=3) -> None:
        super().__init__()
        # NOTE(review): hidden_features and drop are accepted but unused in
        # this implementation -- confirm against the reference XCiT code.
        out_features = out_features or in_features

        padding = kernel_size // 2

        # depthwise conv: group == out_features (one filter per channel)
        self.conv1 = nn.Conv2d(in_features, out_features, kernel_size=kernel_size,
                               padding=padding, pad_mode='pad', group=out_features, has_bias=True)
        self.act = act_layer()
        # NOTE(review): bn and conv2 are sized by in_features; this matches
        # conv1's output only when out_features == in_features (the default).
        self.bn = nn.BatchNorm2d(in_features)
        self.conv2 = nn.Conv2d(in_features, out_features, kernel_size=kernel_size,
                               padding=padding, pad_mode='pad', group=out_features, has_bias=True)

    def construct(self, x, H, W) -> Tensor:
        # tokens (B, N, C) -> image layout (B, C, H, W); assumes N == H*W -- TODO confirm
        B, N, C = x.shape
        x = ops.reshape(ops.transpose(x, (0, 2, 1)), (B, C, H, W))
        x = self.conv1(x)
        x = self.act(x)
        x = self.bn(x)
        x = self.conv2(x)
        # back to token layout (B, N, C)
        x = ops.transpose(ops.reshape(x, (B, C, N)), (0, 2, 1))

        return x

mindcv.models.xcit.PositionalEncodingFourier

Bases: Cell

Positional encoding relying on a fourier kernel matching the one used in the "Attention Is All You Need" paper. The implementation builds on DeTR code https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py

Source code in mindcv\models\xcit.py
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
class PositionalEncodingFourier(nn.Cell):
    """
    Positional encoding relying on a fourier kernel matching the one used in the
    "Attention Is All You Need" paper. The implementation builds on DeTR code
    https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py
    """

    def __init__(self,
                 hidden_dim: int = 32,
                 dim: int = 768,
                 temperature=10000
                 ) -> None:
        super().__init__()
        # 1x1 conv projects the 2*hidden_dim fourier features to the model dim
        self.token_projection = nn.Conv2d(
            hidden_dim * 2, dim, kernel_size=1, has_bias=True)
        self.scale = 2 * np.pi
        self.temperature = temperature
        self.hidden_dim = hidden_dim
        self.dim = dim

    def construct(self, B, H, W) -> Tensor:
        """Return a (B, dim, H, W) positional-encoding tensor for a BxHxW grid."""
        # all-False mask means every grid position is valid
        mask = Tensor(np.zeros((B, H, W)).astype(bool))
        not_mask = ~mask
        # cumulative sums give each cell its (row, col) coordinate, 1-based
        y_embed = not_mask.cumsum(1, dtype=mstype.float32)
        x_embed = not_mask.cumsum(2, dtype=mstype.float32)
        eps = 1e-6
        # normalize coordinates to [0, 2*pi]
        y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
        x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

        # geometric frequency ladder, as in the transformer sinusoidal encoding
        # NOTE(review): `numpy` here is distinct from the `np` alias used above
        # (presumably mindspore.numpy, given the mstype dtype) -- confirm imports.
        dim_t = numpy.arange(self.hidden_dim, dtype=mstype.float32)
        dim_t = self.temperature ** (2 * (dim_t // 2) / self.hidden_dim)

        pos_x = x_embed[:, :, :, None] / dim_t
        pos_y = y_embed[:, :, :, None] / dim_t
        # interleave sin on even indices with cos on odd indices
        pos_x = ops.stack((ops.sin(pos_x[:, :, :, 0::2]),
                           ops.cos(pos_x[:, :, :, 1::2])), 4)
        x1, x2, x3, x4, x5 = pos_x.shape
        pos_x = ops.reshape(pos_x, (x1, x2, x3, x4 * x5))
        pos_y = ops.stack((ops.sin(pos_y[:, :, :, 0::2]),
                           ops.cos(pos_y[:, :, :, 1::2])), 4)
        y1, y2, y3, y4, y5 = pos_y.shape
        pos_y = ops.reshape(pos_y, (y1, y2, y3, y4 * y5))
        # concat y/x features and move channels first: (B, 2*hidden_dim, H, W)
        pos = ops.transpose(ops.concat((pos_y, pos_x), 3), (0, 3, 1, 2))
        pos = self.token_projection(pos)
        return pos

mindcv.models.xcit.XCA

Bases: Cell

Cross-Covariance Attention (XCA) operation where the channels are updated using a weighted sum. The weights are obtained from the (softmax normalized) Cross-covariance matrix (Q^T K \in d_h \times d_h)

Source code in mindcv\models\xcit.py
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
class XCA(nn.Cell):

    """ Cross-Covariance Attention (XCA) operation where the channels are updated using a weighted
     sum. The weights are obtained from the (softmax normalized) Cross-covariance
    matrix (Q^T K \\in d_h \\times d_h)
    """

    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        # learnable per-head temperature replaces the fixed 1/sqrt(d) scale
        self.temperature = Parameter(
            ops.Ones()((num_heads, 1, 1), mstype.float32))
        self.qkv = nn.Dense(
            in_channels=dim, out_channels=dim * 3, has_bias=qkv_bias)
        self.q_matmul_k = ops.BatchMatMul(transpose_b=True)
        self.softmax = nn.Softmax(axis=-1)
        self.attn_drop = Dropout(p=attn_drop)
        self.attn_matmul_v = ops.BatchMatMul()
        self.proj = nn.Dense(in_channels=dim, out_channels=dim)
        self.proj_drop = Dropout(p=proj_drop)

    def construct(self, x):
        # x: (B, N, C); attention acts on channels, not tokens
        B, N, C = x.shape

        # project to q,k,v: (B, N, 3C) -> (3, B, heads, N, head_dim)
        qkv = ops.reshape(
            self.qkv(x), (B, N, 3, self.num_heads, C // self.num_heads))
        qkv = ops.transpose(qkv, (2, 0, 3, 1, 4))
        q, k, v = ops.unstack(qkv, axis=0)

        # swap token and channel axes so attention is channel-to-channel
        q = ops.transpose(q, (0, 1, 3, 2))
        k = ops.transpose(k, (0, 1, 3, 2))
        v = ops.transpose(v, (0, 1, 3, 2))

        # (head_dim x head_dim) cross-covariance matrix per head
        attn = self.q_matmul_k(q, k) * self.temperature
        attn = self.softmax(attn)
        attn = self.attn_drop(attn)
        x = self.attn_matmul_v(attn, v)
        # restore (B, N, C) token-major layout
        x = ops.transpose(x, (0, 3, 1, 2))
        x = ops.reshape(x, (B, N, C))
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

mindcv.models.xcit.XCiT

Bases: Cell

XCiT model class, based on "XCiT: Cross-Covariance Image Transformers" <https://arxiv.org/abs/2106.09681>_ Args: img_size (int, tuple): input image size patch_size (int, tuple): patch size in_chans (int): number of input channels num_classes (int): number of classes for classification head embed_dim (int): embedding dimension depth (int): depth of transformer num_heads (int): number of attention heads mlp_ratio (int): ratio of mlp hidden dim to embedding dim qkv_bias (bool): enable bias for qkv if True qk_scale (float): override default qk scale of head_dim ** -0.5 if set drop_rate (float): dropout rate attn_drop_rate (float): attention dropout rate drop_path_rate (float): stochastic depth rate norm_layer: (nn.Module): normalization layer cls_attn_layers: (int) Depth of Class attention layers use_pos: (bool) whether to use positional encoding eta: (float) layerscale initialization value tokens_norm: (bool) Whether to normalize all tokens or just the cls_token in the CA

Source code in mindcv\models\xcit.py
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
class XCiT(nn.Cell):
    r"""XCiT model class, based on
    `"XCiT: Cross-Covariance Image Transformers" <https://arxiv.org/abs/2106.09681>`_
    Args:
        img_size (int, tuple): input image size
        patch_size (int, tuple): patch size
        in_chans (int): number of input channels
        num_classes (int): number of classes for classification head
        embed_dim (int): embedding dimension
        depth (int): depth of transformer
        num_heads (int): number of attention heads
        mlp_ratio (int): ratio of mlp hidden dim to embedding dim
        qkv_bias (bool): enable bias for qkv if True
        qk_scale (float): override default qk scale of head_dim ** -0.5 if set
        drop_rate (float): dropout rate
        attn_drop_rate (float): attention dropout rate
        drop_path_rate (float): stochastic depth rate
        norm_layer: (nn.Module): normalization layer
        cls_attn_layers: (int) Depth of Class attention layers
        use_pos: (bool) whether to use positional encoding
        eta: (float) layerscale initialization value
        tokens_norm: (bool) Whether to normalize all tokens or just the cls_token in the CA

    Fix: ``in_chans`` is now forwarded to ConvPatchEmbed (it was previously
    accepted but never used, so non-RGB inputs were silently misconfigured).
    """

    def __init__(self,
                 img_size: int = 224,
                 patch_size: int = 16,
                 in_chans: int = 3,
                 num_classes: int = 1000,
                 embed_dim: int = 768,
                 depth: int = 12,
                 num_heads: int = 12,
                 mlp_ratio: int = 4.,
                 qkv_bias: bool = True,
                 qk_scale: float = None,
                 drop_rate: float = 0.,
                 attn_drop_rate: float = 0.,
                 drop_path_rate: float = 0.,
                 norm_layer: nn.Cell = None,
                 cls_attn_layers: int = 2,
                 use_pos: bool = True,
                 patch_proj: str = 'linear',
                 eta: float = None,
                 tokens_norm: bool = False):
        super().__init__()

        self.num_classes = num_classes
        self.num_features = self.embed_dim = embed_dim
        norm_layer = norm_layer or partial(nn.LayerNorm, epsilon=1e-6)

        # NOTE(review): patch_proj is accepted but unused -- the convolutional
        # projection is always used here.
        self.patch_embed = ConvPatchEmbed(img_size=img_size, embed_dim=embed_dim,
                                          patch_size=patch_size, in_chans=in_chans)

        num_patches = self.patch_embed.num_patches

        self.cls_token = Parameter(
            ops.zeros((1, 1, embed_dim), mstype.float32))
        self.pos_drop = Dropout(p=drop_rate)

        # NOTE(review): every block gets the same drop-path rate (no linear
        # ramp) -- this matches the reference XCiT code, unlike DeiT-style models.
        dpr = [drop_path_rate for i in range(depth)]
        self.blocks = nn.CellList([
            XCABlock(
                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias,
                qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i],
                norm_layer=norm_layer, num_tokens=num_patches, eta=eta)
            for i in range(depth)])

        self.cls_attn_blocks = nn.CellList([
            ClassAttentionBlock(
                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias,
                qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, norm_layer=norm_layer,
                eta=eta, tokens_norm=tokens_norm)
            for i in range(cls_attn_layers)])
        self.norm = norm_layer([embed_dim])
        self.head = nn.Dense(
            in_channels=embed_dim, out_channels=num_classes) if num_classes > 0 else ops.Identity()

        self.pos_embeder = PositionalEncodingFourier(dim=embed_dim)
        self.use_pos = use_pos

        # Classifier head
        self.cls_token.set_data(weight_init.initializer(weight_init.TruncatedNormal(sigma=0.02),
                                                        self.cls_token.shape,
                                                        self.cls_token.dtype))
        self._init_weights()

    def _init_weights(self) -> None:
        """Truncated-normal init for Dense weights; zeros/ones for biases and LayerNorm."""
        for name, m in self.cells_and_names():
            if isinstance(m, nn.Dense):
                # NOTE(review): assigns an initializer object to m.weight instead
                # of calling m.weight.set_data as done for the bias -- confirm
                # this actually rewrites the parameter in MindSpore.
                m.weight = weight_init.initializer(weight_init.TruncatedNormal(
                    sigma=0.02), m.weight.shape, mindspore.float32)
                if m.bias is not None:
                    m.bias.set_data(weight_init.initializer(
                        weight_init.Constant(0), m.bias.shape))
            elif isinstance(m, nn.LayerNorm):
                m.beta.set_data(weight_init.initializer(
                    weight_init.Constant(0), m.beta.shape))
                m.gamma.set_data(weight_init.initializer(
                    weight_init.Constant(1), m.gamma.shape))

    def forward_features(self, x):
        """Embed patches, run XCA blocks, then class-attention blocks; return the class token."""
        B, C, H, W = x.shape
        x, (Hp, Wp) = self.patch_embed(x)
        if self.use_pos:
            # fourier positional encoding, reshaped to token-major (B, N, C)
            pos_encoding = self.pos_embeder(B, Hp, Wp).reshape(
                B, -1, x.shape[1]).transpose(0, 2, 1)
            x = x + pos_encoding
        x = self.pos_drop(x)
        for blk in self.blocks:
            x = blk(x, Hp, Wp)
        # class token is appended only for the class-attention stage
        cls_tokens = ops.broadcast_to(self.cls_token, (B, -1, -1))
        cls_tokens = ops.cast(cls_tokens, x.dtype)
        x = ops.concat((cls_tokens, x), 1)

        for blk in self.cls_attn_blocks:
            x = blk(x, Hp, Wp)
        # normalize and return only the class token representation
        return self.norm(x)[:, 0]

    def construct(self, x):
        """Full forward pass: features -> classification head."""
        x = self.forward_features(x)
        x = self.head(x)
        return x

mindcv.models.xcit.conv3x3(in_planes, out_planes, stride=1)

3x3 convolution with padding

Source code in mindcv\models\xcit.py
92
93
94
95
96
97
98
99
def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution (padding 1, no bias) followed by batch normalization."""
    conv = nn.Conv2d(
        in_planes, out_planes, kernel_size=3, stride=stride, padding=1,
        pad_mode='pad', has_bias=False,
    )
    return nn.SequentialCell([conv, nn.BatchNorm2d(out_planes)])

mindcv.models.xcit.xcit_tiny_12_p16_224(pretrained=False, num_classes=1000, in_channels=3, **kwargs)

Get xcit_tiny_12_p16_224 model. Refer to the base class 'models.XCiT' for more details.

Source code in mindcv\models\xcit.py
478
479
480
481
482
483
484
485
486
487
488
489
490
491
@register_model
def xcit_tiny_12_p16_224(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> XCiT:
    """Build the xcit_tiny_12_p16_224 model.

    Refer to the base class 'models.XCiT' for more details.
    """
    cfg = default_cfgs['xcit_tiny_12_p16_224']
    model = XCiT(
        patch_size=16,
        num_classes=num_classes,
        embed_dim=192,
        depth=12,
        num_heads=4,
        mlp_ratio=4,
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, epsilon=1e-6),
        eta=1.0,
        tokens_norm=True,
        **kwargs,
    )
    if pretrained:
        load_pretrained(model, cfg,
                        num_classes=num_classes, in_channels=in_channels)

    return model